Subject: fix booting with memoryless nodes
From: haveblue@us.ibm.com
References: 443280 - LTC49675

I've reproduced this on 2.6.27.7.  I'm pretty sure it is caused by this
patch:

http://git.kernel.org/gitweb.cgi?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=8f64e1f2d1e09267ac926e15090fd505c1c0cbcb

The problem is that Jon took a loop which was (in pseudocode):

for_each_node(nid)
	NODE_DATA(nid) = careful_alloc(nid);
	setup_bootmem(nid);
	reserve_node_bootmem(nid);

and broke it up into:

for_each_node(nid)
	NODE_DATA(nid) = careful_alloc(nid);
	setup_bootmem(nid);

for_each_node(nid)
	reserve_node_bootmem(nid);

The issue comes in when careful_alloc() is called on a node with no
memory.  It falls back to using bootmem from a previously-initialized
node.  But with Jon's patch applied, that node's reserved regions have
not yet been marked in bootmem, so the allocation hands back bogus
memory (0xc000000000000000) and the kernel pukes later in boot.
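
Concretely, the failure path on the memoryless node looks roughly like
this (pseudocode in the same spirit as above; "node 0" stands in for
whichever earlier node the allocation falls back to):

	careful_alloc(memoryless_nid)
		-> no local memory, lmb places the allocation on node 0
		-> retry from node 0's bootmem so the area is accounted
		-> node 0's reserved regions are not marked yet
		-> returns memory that is already in use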

The following patch collapses the loop back together.  It also breaks
the reserved-region marking out into its own function,
mark_reserved_regions_for_nid(), and adds some comments.  I think a huge
part of what introduced this bug is that the for loop was too long and
hard to read.
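
With the patch applied, do_init_bootmem()'s per-node loop is back to
roughly (same pseudocode style as above):

for_each_online_node(nid)
	NODE_DATA(nid) = careful_allocation(nid);
	setup_bootmem(nid);
	mark_reserved_regions_for_nid(nid);
	sparse_memory_present_with_active_regions(nid);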

The actual bug fix here is this check:

+		if (end_pfn <= node->node_start_pfn ||
+		    start_pfn >= node_end_pfn)
+			continue;
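
That is just a range-overlap test: a reserved lmb region spanning pfns
[start_pfn, end_pfn) only matters to this node if it overlaps the node's
span [node_start_pfn, node_end_pfn).  Stated as a standalone helper (for
illustration only, not part of the patch):

	/*
	 * Two half-open pfn ranges overlap iff each one starts before
	 * the other ends.  The "continue" above is the negation of
	 * this, with 'a' the reserved region and 'b' the node's span.
	 */
	int pfn_ranges_overlap(unsigned long a_start, unsigned long a_end,
			       unsigned long b_start, unsigned long b_end)
	{
		return a_start < b_end && b_start < a_end;
	}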

Signed-off-by: Olaf Hering <olh@suse.de>


 arch/powerpc/mm/numa.c |  129 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 81 insertions(+), 48 deletions(-)


@@ -867,10 +867,75 @@ static struct notifier_block __cpuinitdata
 	.priority = 1 /* Must run before sched domains notifier. */
 };

+static void mark_reserved_regions_for_nid(int nid)
+{
+	struct pglist_data *node = NODE_DATA(nid);
+	int i;
+
+	dbg("mark_reserved_regions_for_nid(%d) NODE_DATA: %p\n", nid, node);
+	for (i = 0; i < lmb.reserved.cnt; i++) {
+		unsigned long physbase = lmb.reserved.region[i].base;
+		unsigned long size = lmb.reserved.region[i].size;
+		unsigned long start_pfn = physbase >> PAGE_SHIFT;
+		unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT);
+		struct node_active_region node_ar;
+		unsigned long node_end_pfn = node->node_start_pfn +
+					     node->node_spanned_pages;
+
+		/*
+		 * Check to make sure that this lmb.reserved area is
+		 * within the bounds of the node that we care about.
+		 * Checking the nid of the start and end points is not
+		 * sufficient because the reserved area could span the
+		 * entire node.
+		 */
+		if (end_pfn <= node->node_start_pfn ||
+		    start_pfn >= node_end_pfn)
+			continue;
+
+		get_node_active_region(start_pfn, &node_ar);
+		while (start_pfn < end_pfn &&
+			node_ar.start_pfn < node_ar.end_pfn) {
+			unsigned long reserve_size = size;
+			/*
+			 * if reserved region extends past active region
+			 * then trim size to active region
+			 */
+			if (end_pfn > node_ar.end_pfn)
+				reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
+						- physbase;
+			/*
+			 * Only worry about *this* node, others may not
+			 * yet have valid NODE_DATA().
+			 */
+			if (node_ar.nid == nid)
+				reserve_bootmem_node(NODE_DATA(node_ar.nid),
+						physbase, reserve_size,
+						BOOTMEM_DEFAULT);
+			/*
+			 * if reserved region is contained in the active region
+			 * then done.
+			 */
+			if (end_pfn <= node_ar.end_pfn)
+				break;
+
+			/*
+			 * reserved region extends past the active region
+			 * get next active region that contains this
+			 * reserved region
+			 */
+			start_pfn = node_ar.end_pfn;
+			physbase = start_pfn << PAGE_SHIFT;
+			size = size - reserve_size;
+			get_node_active_region(start_pfn, &node_ar);
+		}
+	}
+}
+
+
 void __init do_init_bootmem(void)
 {
 	int nid;
-	unsigned int i;

 	min_low_pfn = 0;
 	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
@@ -890,9 +955,16 @@ void __init do_init_bootmem(void)
 		unsigned long bootmem_paddr;
 		unsigned long bootmap_pages;

+		dbg("node %d is online\n", nid);
 		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);

-		/* Allocate the node structure node local if possible */
+		/*
+		 * Allocate the node structure node local if possible
+		 *
+		 * Be careful moving this around, as it relies on all
+		 * previous nodes' bootmem to be initialized and have
+		 * all reserved areas marked.
+		 */
 		NODE_DATA(nid) = careful_allocation(nid,
 					sizeof(struct pglist_data),
 					SMP_CACHE_BYTES, end_pfn);
@@ -924,53 +996,14 @@ void __init do_init_bootmem(void)
 				  start_pfn, end_pfn);

 		free_bootmem_with_active_regions(nid, end_pfn);
-	}
-
-	/* Mark reserved regions */
-	for (i = 0; i < lmb.reserved.cnt; i++) {
-		unsigned long physbase = lmb.reserved.region[i].base;
-		unsigned long size = lmb.reserved.region[i].size;
-		unsigned long start_pfn = physbase >> PAGE_SHIFT;
-		unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT);
-		struct node_active_region node_ar;
-
-		get_node_active_region(start_pfn, &node_ar);
-		while (start_pfn < end_pfn &&
-			node_ar.start_pfn < node_ar.end_pfn) {
-			unsigned long reserve_size = size;
-			/*
-			 * if reserved region extends past active region
-			 * then trim size to active region
-			 */
-			if (end_pfn > node_ar.end_pfn)
-				reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
-					- (start_pfn << PAGE_SHIFT);
-			dbg("reserve_bootmem %lx %lx nid=%d\n", physbase,
-				reserve_size, node_ar.nid);
-			reserve_bootmem_node(NODE_DATA(node_ar.nid), physbase,
-				reserve_size, BOOTMEM_DEFAULT);
-			/*
-			 * if reserved region is contained in the active region
-			 * then done.
-			 */
-			if (end_pfn <= node_ar.end_pfn)
-				break;
-
-			/*
-			 * reserved region extends past the active region
-			 * get next active region that contains this
-			 * reserved region
-			 */
-			start_pfn = node_ar.end_pfn;
-			physbase = start_pfn << PAGE_SHIFT;
-			size = size - reserve_size;
-			get_node_active_region(start_pfn, &node_ar);
-		}
-
-	}
-
-	for_each_online_node(nid)
+		/*
+		 * Be very careful about moving this around.  Future
+		 * calls to careful_allocation() depend on this getting
+		 * done correctly.
+		 */
+		mark_reserved_regions_for_nid(nid);
 		sparse_memory_present_with_active_regions(nid);
+	}
 }

 void __init paging_init(void)