Blob Blame History Raw
Subject: fix booting with memoryless nodes
From: haveblue@us.ibm.com
References: 443280 - LTC49675
I've reproduced this on 2.6.27.7. I'm pretty sure it is caused by this
patch:
http://git.kernel.org/gitweb.cgi?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=8f64e1f2d1e09267ac926e15090fd505c1c0cbcb
The problem is that Jon took a loop which was (in psuedocode):
for_each_node(nid)
NODE_DATA(nid) = careful_alloc(nid);
setup_bootmem(nid);
reserve_node_bootmem(nid);
and broke it up into:
for_each_node(nid)
NODE_DATA(nid) = careful_alloc(nid);
setup_bootmem(nid);
for_each_node(nid)
reserve_node_bootmem(nid);
The issue comes in when the 'careful_alloc()' is called on a node with
no memory. It falls back to using bootmem from a previously-initialized
node. But, bootmem has not yet been reserved when Jon's patch is
applied. It gives back bogus memory (0xc000000000000000) and pukes
later in boot.
The following patch collapses the loop back together. It also breaks
the mark_reserved_regions_for_nid() code out into a function and adds
some comments. I think a huge part of introducing this bug is because
for loop was too long and hard to read.
The actual bug fix here is the:
+ if (end_pfn <= node->node_start_pfn ||
+ start_pfn >= node_end_pfn)
+ continue;
Signed-off-by: Olaf Hering <olh@suse.de>
---
arch/powerpc/mm/numa.c | 129 ++++++++++++++++++++++++++++++-------------------
1 file changed, 81 insertions(+), 48 deletions(-)
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -867,10 +867,75 @@ static struct notifier_block __cpuinitda
.priority = 1 /* Must run before sched domains notifier. */
};
+static void mark_reserved_regions_for_nid(int nid)
+{
+ struct pglist_data *node = NODE_DATA(nid);
+ int i;
+
+ dbg("mark_reserved_regions_for_nid(%d) NODE_DATA: %p\n", nid, node);
+ for (i = 0; i < lmb.reserved.cnt; i++) {
+ unsigned long physbase = lmb.reserved.region[i].base;
+ unsigned long size = lmb.reserved.region[i].size;
+ unsigned long start_pfn = physbase >> PAGE_SHIFT;
+ unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT);
+ struct node_active_region node_ar;
+ unsigned long node_end_pfn = node->node_start_pfn +
+ node->node_spanned_pages;
+
+ /*
+ * Check to make sure that this lmb.reserved area is
+ * within the bounds of the node that we care about.
+ * Checking the nid of the start and end points is not
+ * sufficient because the reserved area could span the
+ * entire node.
+ */
+ if (end_pfn <= node->node_start_pfn ||
+ start_pfn >= node_end_pfn)
+ continue;
+
+ get_node_active_region(start_pfn, &node_ar);
+ while (start_pfn < end_pfn &&
+ node_ar.start_pfn < node_ar.end_pfn) {
+ unsigned long reserve_size = size;
+ /*
+ * if reserved region extends past active region
+ * then trim size to active region
+ */
+ if (end_pfn > node_ar.end_pfn)
+ reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
+ - physbase;
+ /*
+ * Only worry about *this* node, others may not
+ * yet have valid NODE_DATA().
+ */
+ if (node_ar.nid == nid)
+ reserve_bootmem_node(NODE_DATA(node_ar.nid),
+ physbase, reserve_size,
+ BOOTMEM_DEFAULT);
+ /*
+ * if reserved region is contained in the active region
+ * then done.
+ */
+ if (end_pfn <= node_ar.end_pfn)
+ break;
+
+ /*
+ * reserved region extends past the active region
+ * get next active region that contains this
+ * reserved region
+ */
+ start_pfn = node_ar.end_pfn;
+ physbase = start_pfn << PAGE_SHIFT;
+ size = size - reserve_size;
+ get_node_active_region(start_pfn, &node_ar);
+ }
+ }
+}
+
+
void __init do_init_bootmem(void)
{
int nid;
- unsigned int i;
min_low_pfn = 0;
max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
@@ -890,9 +955,16 @@ void __init do_init_bootmem(void)
unsigned long bootmem_paddr;
unsigned long bootmap_pages;
+ dbg("node %d is online\n", nid);
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
- /* Allocate the node structure node local if possible */
+ /*
+ * Allocate the node structure node local if possible
+ *
+ * Be careful moving this around, as it relies on all
+ * previous nodes' bootmem to be initialized and have
+ * all reserved areas marked.
+ */
NODE_DATA(nid) = careful_allocation(nid,
sizeof(struct pglist_data),
SMP_CACHE_BYTES, end_pfn);
@@ -924,53 +996,14 @@ void __init do_init_bootmem(void)
start_pfn, end_pfn);
free_bootmem_with_active_regions(nid, end_pfn);
- }
-
- /* Mark reserved regions */
- for (i = 0; i < lmb.reserved.cnt; i++) {
- unsigned long physbase = lmb.reserved.region[i].base;
- unsigned long size = lmb.reserved.region[i].size;
- unsigned long start_pfn = physbase >> PAGE_SHIFT;
- unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT);
- struct node_active_region node_ar;
-
- get_node_active_region(start_pfn, &node_ar);
- while (start_pfn < end_pfn &&
- node_ar.start_pfn < node_ar.end_pfn) {
- unsigned long reserve_size = size;
- /*
- * if reserved region extends past active region
- * then trim size to active region
- */
- if (end_pfn > node_ar.end_pfn)
- reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
- - (start_pfn << PAGE_SHIFT);
- dbg("reserve_bootmem %lx %lx nid=%d\n", physbase,
- reserve_size, node_ar.nid);
- reserve_bootmem_node(NODE_DATA(node_ar.nid), physbase,
- reserve_size, BOOTMEM_DEFAULT);
- /*
- * if reserved region is contained in the active region
- * then done.
- */
- if (end_pfn <= node_ar.end_pfn)
- break;
-
- /*
- * reserved region extends past the active region
- * get next active region that contains this
- * reserved region
- */
- start_pfn = node_ar.end_pfn;
- physbase = start_pfn << PAGE_SHIFT;
- size = size - reserve_size;
- get_node_active_region(start_pfn, &node_ar);
- }
-
- }
-
- for_each_online_node(nid)
+ /*
+ * Be very careful about moving this around. Future
+ * calls to careful_allocation() depend on this getting
+ * done correctly.
+ */
+ mark_reserved_regions_for_nid(nid);
sparse_memory_present_with_active_regions(nid);
+ }
}
void __init paging_init(void)