Blob Blame History Raw
From: jbeulich@novell.com
Subject: eliminate scalability issues from initrd handling
Patch-mainline: n/a

Xen kernels were subject to initrd size restrictions that native kernels
don't have, because the initrd got mapped into the initial mapping. The
kernel doesn't really need the initrd to be mapped, so use the new
infrastructure available in Xen 4.1+ to avoid the mapping and hence the
restriction.

--- head.orig/arch/x86/include/mach-xen/asm/setup.h	2011-07-04 12:32:43.000000000 +0200
+++ head/arch/x86/include/mach-xen/asm/setup.h	2011-07-04 12:35:19.000000000 +0200
@@ -3,6 +3,13 @@
 void xen_start_kernel(void);
 void xen_arch_setup(void);
 
+#ifdef CONFIG_X86_64
+void reserve_pfn_range(unsigned long pfn, unsigned long nr);
+void reserve_pgtable_low(void);
+#endif
+
+extern unsigned long xen_initrd_start;
+
 #ifdef CONFIG_EFI
 void efi_probe(void);
 #else
--- head.orig/arch/x86/kernel/head-xen.c	2012-02-10 13:43:16.000000000 +0100
+++ head/arch/x86/kernel/head-xen.c	2011-07-11 15:42:00.000000000 +0200
@@ -76,6 +76,8 @@ extern void nmi(void);
 #define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) }
 #endif
 
+unsigned long __initdata xen_initrd_start;
+
 unsigned long *__read_mostly machine_to_phys_mapping =
 	(void *)MACH2PHYS_VIRT_START;
 EXPORT_SYMBOL(machine_to_phys_mapping);
--- head.orig/arch/x86/kernel/head32-xen.c	2012-02-09 12:32:50.000000000 +0100
+++ head/arch/x86/kernel/head32-xen.c	2012-02-10 14:04:43.000000000 +0100
@@ -75,6 +75,11 @@ void __init i386_start_kernel(void)
 		break;
 	}
 #else
+#ifdef CONFIG_BLK_DEV_INITRD
+	BUG_ON(xen_start_info->flags & SIF_MOD_START_PFN);
+	if (xen_start_info->mod_start)
+		xen_initrd_start = __pa(xen_start_info->mod_start);
+#endif
 	{
 		int max_cmdline;
 
--- head.orig/arch/x86/kernel/head64-xen.c	2012-02-10 14:03:06.000000000 +0100
+++ head/arch/x86/kernel/head64-xen.c	2012-02-10 14:04:44.000000000 +0100
@@ -120,11 +120,21 @@ void __init x86_64_start_reservations(ch
 	memblock_reserve(__pa_symbol(&_text),
 			 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
 
+#ifdef CONFIG_BLK_DEV_INITRD
+	/* Reserve INITRD if needed. */
+	if (xen_start_info->flags & SIF_MOD_START_PFN) {
+		reserve_pfn_range(xen_start_info->mod_start,
+				  PFN_UP(xen_start_info->mod_len));
+		xen_initrd_start = xen_start_info->mod_start << PAGE_SHIFT;
+	} else if (xen_start_info->mod_start)
+		xen_initrd_start = __pa(xen_start_info->mod_start);
+#endif
+
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		xen_start_info->mfn_list = ~0UL;
 	else if (xen_start_info->mfn_list < __START_KERNEL_map)
-		memblock_reserve(PFN_PHYS(xen_start_info->first_p2m_pfn),
-				 PFN_PHYS(xen_start_info->nr_p2m_frames));
+		reserve_pfn_range(xen_start_info->first_p2m_pfn,
+				  xen_start_info->nr_p2m_frames);
 
 	/*
 	 * At this point everything still needed from the boot loader
--- head.orig/arch/x86/kernel/head_64-xen.S	2011-08-09 11:19:00.000000000 +0200
+++ head/arch/x86/kernel/head_64-xen.S	2011-08-09 11:19:40.000000000 +0200
@@ -160,6 +160,7 @@ ENTRY(empty_zero_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .quad startup_64)
 	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,   .quad _PAGE_PRESENT, _PAGE_PRESENT)
+	ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN,  .long 1)
 	ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad VMEMMAP_START)
 	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .ascii "writable_page_tables";
 						 .ascii "|writable_descriptor_tables";
--- head.orig/arch/x86/kernel/setup-xen.c	2012-05-23 13:48:34.000000000 +0200
+++ head/arch/x86/kernel/setup-xen.c	2012-05-23 13:48:38.000000000 +0200
@@ -466,7 +466,7 @@ static void __init relocate_initrd(void)
 #else
 	printk(KERN_ERR "initrd extends beyond end of memory "
 	       "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
-	       __pa(xen_start_info->mod_start) + xen_start_info->mod_len,
+	       xen_initrd_start + xen_start_info->mod_len,
 	       max_low_pfn_mapped << PAGE_SHIFT);
 	initrd_start = 0;
 #endif
@@ -485,7 +485,7 @@ static void __init reserve_initrd(void)
 	    !ramdisk_image || !ramdisk_size)
 		return;		/* No initrd provided by bootloader */
 #else
-	unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
+	unsigned long ramdisk_image = xen_initrd_start;
 	unsigned long ramdisk_size  = xen_start_info->mod_len;
 	unsigned long ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
 	unsigned long end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
--- head.orig/arch/x86/mm/init-xen.c	2012-04-11 18:02:45.000000000 +0200
+++ head/arch/x86/mm/init-xen.c	2012-04-11 18:05:33.000000000 +0200
@@ -354,14 +354,7 @@ unsigned long __init_refok init_memory_m
 	 */
 	if (!after_bootmem && pgt_buf_top > pgt_buf_start) {
 #ifdef CONFIG_X86_64
-		if (xen_start_info->mfn_list < __START_KERNEL_map
-		    && pgt_buf_start <= xen_start_info->first_p2m_pfn
-		    && pgt_buf_top > xen_start_info->first_p2m_pfn) {
-			x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
-					PFN_PHYS(xen_start_info->first_p2m_pfn));
-			pgt_buf_start = xen_start_info->first_p2m_pfn
-					+ xen_start_info->nr_p2m_frames;
-		}
+		reserve_pgtable_low();
 #endif
 		x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
 				PFN_PHYS(pgt_buf_top));
--- head.orig/arch/x86/mm/init_64-xen.c	2012-04-11 18:02:40.000000000 +0200
+++ head/arch/x86/mm/init_64-xen.c	2012-04-11 18:05:36.000000000 +0200
@@ -220,13 +220,70 @@ void sync_global_pgds(unsigned long star
 	}
 }
 
+static struct reserved_pfn_range {
+	unsigned long pfn, nr;
+} reserved_pfn_ranges[3] __meminitdata;
+
+void __init reserve_pfn_range(unsigned long pfn, unsigned long nr)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(reserved_pfn_ranges); ++i) {
+		struct reserved_pfn_range *range = reserved_pfn_ranges + i;
+
+		if (!range->nr) {
+			range->pfn = pfn;
+			range->nr = nr;
+			break;
+		}
+		BUG_ON(range->pfn < pfn + nr && pfn < range->pfn + range->nr);
+		if (range->pfn > pfn) {
+			i = ARRAY_SIZE(reserved_pfn_ranges) - 1;
+			if (reserved_pfn_ranges[i].nr)
+				continue;
+			for (; reserved_pfn_ranges + i > range; --i)
+				reserved_pfn_ranges[i]
+					 = reserved_pfn_ranges[i - 1];
+			range->pfn = pfn;
+			range->nr = nr;
+			break;
+		}
+	}
+	BUG_ON(i >= ARRAY_SIZE(reserved_pfn_ranges));
+	memblock_reserve(PFN_PHYS(pfn), PFN_PHYS(nr));
+}
+
+void __init reserve_pgtable_low(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(reserved_pfn_ranges); ++i) {
+		struct reserved_pfn_range *range = reserved_pfn_ranges + i;
+
+		if (!range->nr)
+			break;
+		if (pgt_buf_start <= range->pfn && pgt_buf_top > range->pfn) {
+			x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
+					PFN_PHYS(range->pfn));
+			pgt_buf_start = range->pfn + range->nr;
+		}
+	}
+}
+
 static __init unsigned long get_table_end(void)
 {
+	unsigned int i;
+
 	BUG_ON(!pgt_buf_end);
-	if (xen_start_info->mfn_list < __START_KERNEL_map
-	    && pgt_buf_end == xen_start_info->first_p2m_pfn) {
-		pgt_buf_end += xen_start_info->nr_p2m_frames;
-		pgt_buf_top += xen_start_info->nr_p2m_frames;
+	for (i = 0; i < ARRAY_SIZE(reserved_pfn_ranges); ++i) {
+		struct reserved_pfn_range *range = reserved_pfn_ranges + i;
+
+		if (!range->nr)
+			break;
+		if (pgt_buf_end == range->pfn) {
+			pgt_buf_end += range->nr;
+			pgt_buf_top += range->nr;
+		}
 	}
 	return pgt_buf_end++;
 }
@@ -480,14 +537,25 @@ static inline int __meminit make_readonl
 	    && !max_pfn_mapped
 	    && (paddr >= (pgt_buf_start << PAGE_SHIFT))) {
 		unsigned long top = pgt_buf_top;
+		unsigned int i;
+
+		/* Account for the ranges get_table_end() skips. */
+		for (i = 0; i < ARRAY_SIZE(reserved_pfn_ranges); ++i) {
+			const struct reserved_pfn_range *range;
 
-		/* Account for the range get_table_end() skips. */
-		if (xen_start_info->mfn_list < __START_KERNEL_map
-		    && pgt_buf_end <= xen_start_info->first_p2m_pfn
-		    && top > xen_start_info->first_p2m_pfn)
-			top += xen_start_info->nr_p2m_frames;
+			range = reserved_pfn_ranges + i;
+			if (!range->nr)
+				continue;
+			if (pgt_buf_end <= range->pfn && top > range->pfn) {
+				if (paddr > (range->pfn << PAGE_SHIFT)
+				    && paddr < ((range->pfn + range->nr)
+					        << PAGE_SHIFT))
+					break;
+				top += range->nr;
+			}
+		}
 		if (paddr < (top << PAGE_SHIFT))
-			readonly = 1;
+			readonly = (i >= ARRAY_SIZE(reserved_pfn_ranges));
 	}
 	/* Make old page tables read-only. */
 	if (!xen_feature(XENFEAT_writable_page_tables)
@@ -832,9 +900,6 @@ void __init xen_finish_init_mapping(void
 	    && xen_start_info->mfn_list >= __START_KERNEL_map)
 		phys_to_machine_mapping =
 			__va(__pa(xen_start_info->mfn_list));
-	if (xen_start_info->mod_start)
-		xen_start_info->mod_start = (unsigned long)
-			__va(__pa(xen_start_info->mod_start));
 
 	/* Unpin the no longer used Xen provided page tables. */
 	mmuext.cmd = MMUEXT_UNPIN_TABLE;