From: Michal Hocko <mhocko@suse.com>
Subject: mm: __init_single_page do not zero core struct pages
Patch-mainline: no, fixed differently
References: bnc#1123124

This is a follow-up to the d0dc12e86b31 ("mm/memory_hotplug: optimize memory
hotplug") backport, which does not pull in many other changes in the memmap
initialization area. Most notably it leaves out f7f99100d8d9 ("mm: stop zeroing
memory during allocation in vmemmap"), because that change has caused a lot of
regressions upstream and the situation there has not settled down yet. So a
decision has been made to only bring in the speed-ups affecting memory hotplug.

While pulling in d0dc12e86b31, __init_single_page() started to zero out all
struct pages unconditionally. This should be OK because nobody should be
accessing struct pages which haven't gone through the full initialization yet.
Except this doesn't hold for some Xen-pinned page table pages, and our backport
has caused an early boot regression with the following crash:
[    0.004000] BUG: unable to handle kernel paging request at ffff8807d63e9008
[    0.004000] IP: xen_set_pud+0x46/0xb0
[    0.004000] PGD 200c067 P4D 200c067 PUD 3294067 PMD 7da358067 PTE 80100007d63e9065
[    0.004000] Oops: 0003 [#1] SMP NOPTI
[    0.004000] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.12.14-29-default #1 SLE15
[    0.004000] Hardware name: Dell Inc. PowerEdge R230/0FRVY0, BIOS 2.4.3 01/31/2018
[    0.004000] task: ffffffff820134c0 task.stack: ffffffff82000000
[    0.004000] RIP: e030:xen_set_pud+0x46/0xb0
[    0.004000] RSP: e02b:ffffffff82003cf0 EFLAGS: 00010246
[    0.004000] RAX: 0017ffffc0000800 RBX: ffff880207d21000 RCX: 00003ffffffff000
[    0.004000] RDX: 000077ff80000000 RSI: 00000006d1d72067 RDI: ffff8807d63e9008
[    0.004000] RBP: ffff8807d63e9008 R08: 0000000000000000 R09: ffffea00061f4880
[    0.004000] R10: 0000000000000000 R11: ffff88084ca280c0 R12: 00000006d1d72067
[    0.004000] R13: ffffffff82119534 R14: 0000000000000000 R15: ffffc90040000000
[    0.004000] FS:  0000000000000000(0000) GS:ffff88084ca00000(0000) knlGS:0000000000000000
[    0.004000] CS:  e033 DS: 0000 ES: 0000 CR0: 0000000080050033
[    0.004000] CR2: ffff8807d63e9008 CR3: 000000000200a000 CR4: 0000000000042660
[    0.004000] Call Trace:
[    0.004000]  __pmd_alloc+0x113/0x170
[    0.004000]  ? acpi_os_map_iomem+0x14c/0x180
[    0.004000]  ioremap_page_range+0x415/0x460
[    0.004000]  ? __get_vm_area_node+0xcb/0x130
[    0.004000]  ? acpi_os_map_iomem+0x14c/0x180
[    0.004000]  __ioremap_caller+0x20a/0x2e0
[    0.004000]  acpi_os_map_iomem+0x14c/0x180
[    0.004000]  acpi_tb_acquire_table+0x39/0x64
[    0.004000]  acpi_tb_validate_table+0x44/0x7c
[    0.004000]  acpi_tb_verify_temp_table+0x42/0x2ff
[    0.004000]  ? acpi_ut_acquire_mutex+0x12a/0x1bf
[    0.004000]  acpi_reallocate_root_table+0x12f/0x148
[    0.004000]  acpi_early_init+0x4b/0x102
[    0.004000]  start_kernel+0x3b0/0x446
[    0.004000]  xen_start_kernel+0x535/0x53f

This is a protection fault triggered by an attempt to write a read-only PTE.
Juergen explains:
: When a pagetable entry is to be written the "Pinned" flag of the related struct
: page is tested and when it is zero the entry is written directly instead of via
: the hypervisor.  So in our case the pagetable isn't flagged as Pinned, hence
: the protection fault.
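
A minimal sketch of the write path Juergen describes, modeled on xen_set_pud()
in arch/x86/xen (exact file and helper names vary between kernel versions):

	static void xen_set_pud(pud_t *ptr, pud_t val)
	{
		/*
		 * xen_page_pinned() tests the Pinned flag of the struct page
		 * backing the pagetable.  If that flag has been wiped by the
		 * unconditional zeroing in __init_single_page(), we take the
		 * direct-write path below for a pagetable the hypervisor has
		 * made read-only, hence the protection fault above.
		 */
		if (!xen_page_pinned(ptr)) {
			*ptr = val;		/* unpinned: plain memory write */
			return;
		}

		xen_set_pud_hyper(ptr, val);	/* pinned: update via hypercall */
	}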

This means that _somebody_ (and we didn't manage to find out who or how)
managed to get access to a struct page and set the pin on it, but the struct
page initialization code (deferred_init_memmap, memmap_init_zone or
init_reserved_page) later scribbled over that struct page and destroyed the
pinned state.
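
For context, the pinned state lives in page->flags (on Xen, PG_pinned aliases
PG_owner_priv_1), so the unconditional memset() is enough to destroy it. An
illustration only, not code from the tree:

	SetPagePinned(page);		/* Xen marks the pagetable page as pinned */
	memset(page, 0, sizeof(*page));	/* __init_single_page() wipes page->flags */
	/* PagePinned(page) is now false; the pin has been silently lost */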

A proper fix would be to find the user which relies on an uninitialized struct
page, but we were not able to do so. Maybe there is some initialization ordering
issue. For now, work around the issue by removing the struct page zeroing from
__init_single_page() for anything but hotplugged pages. The core memory memmap
code already zeroes the full memmap during allocation (vmemmap_alloc_block, see
the sketch below), so we should be safe here. Memory hotplug memmaps are zeroed
at that stage as well, but their struct pages can carry stale state from a
previous onlining, so we always have to ensure that the struct page state is
clean during onlining.
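
For reference, a sketch of the allocation-side zeroing we rely on, abridged
from mm/sparse-vmemmap.c as it looks before f7f99100d8d9 (the exact gfp flags
differ between kernel versions):

	void * __meminit vmemmap_alloc_block(unsigned long size, int node)
	{
		/* If the main allocator is up use that, fallback to bootmem. */
		if (slab_is_available()) {
			struct page *page;

			page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO,
						get_order(size));
			if (page)
				return page_address(page);
			return NULL;
		} else
			/* the early boot allocator returns zeroed memory, too */
			return __earlyonly_bootmem_alloc(node, size, size,
					__pa(MAX_DMA_ADDRESS));
	}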

Signed-off-by: Michal Hocko <mhocko@suse.com>

---
 mm/page_alloc.c |   13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1211,9 +1211,10 @@ static void free_one_page(struct zone *z
 }
 
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
-				unsigned long zone, int nid)
+				unsigned long zone, int nid, bool zero)
 {
-	memset(page, 0, sizeof(*page));
+	if (zero)
+		memset(page, 0, sizeof(*page));
 	set_page_links(page, zone, nid, pfn);
 	init_page_count(page);
 	page_mapcount_reset(page);
@@ -1230,7 +1231,7 @@ static void __meminit __init_single_page
 static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
 					int nid)
 {
-	return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
+	return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, false);
 }
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -1578,7 +1579,7 @@ static int __init deferred_init_memmap(v
 				goto free_range;
 			}
 
-			__init_single_page(page, pfn, zid, nid);
+			__init_single_page(page, pfn, zid, nid, false);
 			if (!free_base_page) {
 				free_base_page = page;
 				free_base_pfn = pfn;
@@ -5589,7 +5590,7 @@ void __meminit memmap_init_zone(unsigned
 
 not_early:
 		page = pfn_to_page(pfn);
-		__init_single_page(page, pfn, zone, nid);
+		__init_single_page(page, pfn, zone, nid, context == MEMMAP_HOTPLUG);
 		if (context == MEMMAP_HOTPLUG)
 			__SetPageReserved(page);
 
@@ -5642,7 +5643,7 @@ void __ref memmap_init_zone_device(struc
 	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
 		struct page *page = pfn_to_page(pfn);
 
-		__init_single_page(page, pfn, zone_idx, nid);
+		__init_single_page(page, pfn, zone_idx, nid, true);
 
 		/*
 		 * ZONE_DEVICE pages union ->lru with a ->pgmap back