From: jbeulich@novell.com
Subject: eliminate scalability issues from initial mapping setup
Patch-mainline: n/a
References: bnc#417417

Direct Xen to place the initial P->M table outside of the initial
mapping, as otherwise the 1G (implementation) / 2G (theoretical)
restriction on the size of the initial mapping limits the amount
of memory a domain can be handed initially.
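
The placement is requested via an ELF note in the kernel image; the
head_64-xen.S hunk below adds (with VMEMMAP_START serving as the
well-isolated virtual address the initial P->M table gets mapped at):

	ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad VMEMMAP_START)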

Note that the flags passed to HYPERVISOR_update_va_mapping() from
__make_page_writable() and make_lowmem_page_writable() intentionally
do not include UVMF_ALL. This is intended as an optimal trade-off
between the overhead of a potential spurious page fault (as remote
CPUs may still have read-only translations in their TLBs) and the
overhead of cross-processor flushes. Flushing on the local CPU
shouldn't be as expensive (and hence can be viewed as an optimization
avoiding the spurious page fault on the local CPU), but is required
when the functions are used before the page fault handler gets set up.
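
For illustration, the resulting call in __make_page_writable() flushes
only the local TLB entry; passing UVMF_ALL | UVMF_INVLPG instead would
add exactly the cross-processor flush being avoided here:

	/* Local flush only; remote CPUs may take one spurious, fixable fault. */
	if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), UVMF_INVLPG))
		BUG();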

--- head.orig/arch/x86/kernel/head64-xen.c	2012-02-09 12:32:50.000000000 +0100
+++ head/arch/x86/kernel/head64-xen.c	2012-02-10 14:03:06.000000000 +0100
@@ -120,6 +120,12 @@ void __init x86_64_start_reservations(ch
 	memblock_reserve(__pa_symbol(&_text),
 			 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
 
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		xen_start_info->mfn_list = ~0UL;
+	else if (xen_start_info->mfn_list < __START_KERNEL_map)
+		memblock_reserve(PFN_PHYS(xen_start_info->first_p2m_pfn),
+				 PFN_PHYS(xen_start_info->nr_p2m_frames));
+
 	/*
 	 * At this point everything still needed from the boot loader
 	 * or BIOS or kernel text should be early reserved or marked not
--- head.orig/arch/x86/kernel/head_64-xen.S	2011-08-09 11:17:44.000000000 +0200
+++ head/arch/x86/kernel/head_64-xen.S	2011-08-09 11:19:00.000000000 +0200
@@ -17,6 +17,7 @@
 #include <linux/elfnote.h>
 #include <asm/segment.h>
 #include <asm/page.h>
+#include <asm/pgtable.h>
 #include <asm/msr.h>
 #include <asm/cache.h>
 #include <asm/dwarf2.h>
@@ -159,6 +160,7 @@ ENTRY(empty_zero_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .quad startup_64)
 	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,   .quad _PAGE_PRESENT, _PAGE_PRESENT)
+	ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad VMEMMAP_START)
 	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .ascii "writable_page_tables";
 						 .ascii "|writable_descriptor_tables";
 						 .ascii "|auto_translated_physmap";
--- head.orig/arch/x86/kernel/setup-xen.c	2012-05-23 13:48:26.000000000 +0200
+++ head/arch/x86/kernel/setup-xen.c	2012-06-08 10:49:58.000000000 +0200
@@ -1112,6 +1112,54 @@ void __init setup_arch(char **cmdline_p)
 	init_gbpages();
 
 	/* max_pfn_mapped is updated here */
+#ifdef CONFIG_X86_64_XEN
+	if (xen_start_info->mfn_list < __START_KERNEL_map) {
+		/* Map P2M space only after all usable memory. */
+		unsigned long p2m_start = xen_start_info->first_p2m_pfn;
+		unsigned long p2m_end = p2m_start
+					+ xen_start_info->nr_p2m_frames;
+		unsigned long temp;
+
+		max_low_pfn_mapped = init_memory_mapping(
+			0, min(max_low_pfn, p2m_start) << PAGE_SHIFT);
+		max_pfn_mapped = max_low_pfn_mapped;
+
+		if (p2m_end < max_low_pfn)
+			max_low_pfn_mapped = init_memory_mapping(
+				p2m_end << PAGE_SHIFT,
+				max_low_pfn << PAGE_SHIFT);
+		max_pfn_mapped = max_low_pfn_mapped;
+
+		if (max_low_pfn < p2m_start)
+			max_pfn_mapped = init_memory_mapping(
+				max_low_pfn << PAGE_SHIFT,
+				p2m_start << PAGE_SHIFT);
+
+		if (max(max_low_pfn, p2m_end) < max_pfn)
+			max_pfn_mapped = init_memory_mapping(
+				max(max_low_pfn, p2m_end) << PAGE_SHIFT,
+				max_pfn << PAGE_SHIFT);
+
+		temp = max_pfn_mapped;
+		if (p2m_start < max_low_pfn) {
+			temp = init_memory_mapping(
+				p2m_start << PAGE_SHIFT,
+				min(max_low_pfn, p2m_end) << PAGE_SHIFT);
+			if (temp > max_low_pfn_mapped)
+				max_low_pfn_mapped = temp;
+		}
+
+		if (max_low_pfn < p2m_end)
+			temp = init_memory_mapping(
+				max(max_low_pfn, p2m_start) << PAGE_SHIFT,
+				p2m_end << PAGE_SHIFT);
+		if (temp > max_pfn_mapped)
+			max_pfn_mapped = temp;
+
+		goto init_memory_mapping_done;
+	}
+#endif
+
 	max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
 	max_pfn_mapped = max_low_pfn_mapped;
 
@@ -1119,6 +1167,7 @@ void __init setup_arch(char **cmdline_p)
 	if (max_pfn > max_low_pfn) {
 		max_pfn_mapped = init_memory_mapping(1UL<<32,
 						     max_pfn<<PAGE_SHIFT);
+ init_memory_mapping_done:
 		/* can we preseve max_low_pfn ?*/
 		max_low_pfn = max_pfn;
 	}
@@ -1211,7 +1260,7 @@ void __init setup_arch(char **cmdline_p)
 		difference = xen_start_info->nr_pages - max_pfn;
 
 		set_xen_guest_handle(reservation.extent_start,
-				     ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
+				     phys_to_machine_mapping + max_pfn);
 		reservation.nr_extents = difference;
 		ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
 					   &reservation);
@@ -1224,13 +1273,83 @@ void __init setup_arch(char **cmdline_p)
 		phys_to_machine_mapping = alloc_bootmem_pages(
 			max_pfn * sizeof(unsigned long));
 		memcpy(phys_to_machine_mapping,
-		       (unsigned long *)xen_start_info->mfn_list,
+		       __va(__pa(xen_start_info->mfn_list)),
 		       p2m_pages * sizeof(unsigned long));
 		memset(phys_to_machine_mapping + p2m_pages, ~0,
 		       (max_pfn - p2m_pages) * sizeof(unsigned long));
-		free_bootmem(__pa(xen_start_info->mfn_list),
-			     PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
-					     sizeof(unsigned long))));
+#ifdef CONFIG_X86_64
+		if (xen_start_info->mfn_list == VMEMMAP_START) {
+			/*
+			 * Since it is well isolated we can (and since it is
+			 * perhaps large we should) also free the page tables
+			 * mapping the initial P->M table.
+			 */
+			unsigned long va = VMEMMAP_START, pa;
+			pgd_t *pgd = pgd_offset_k(va);
+			pud_t *pud_page = pud_offset(pgd, 0);
+
+			BUILD_BUG_ON(VMEMMAP_START & ~PGDIR_MASK);
+			xen_l4_entry_update(pgd, __pgd(0));
+			do {
+				pud_t *pud = pud_page + pud_index(va);
+
+				if (pud_none(*pud))
+					va += PUD_SIZE;
+				else if (pud_large(*pud)) {
+					pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+					make_pages_writable(__va(pa),
+						PUD_SIZE >> PAGE_SHIFT,
+						XENFEAT_writable_page_tables);
+					free_bootmem(pa, PUD_SIZE);
+					va += PUD_SIZE;
+				} else {
+					pmd_t *pmd = pmd_offset(pud, va);
+
+					if (pmd_large(*pmd)) {
+						pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+						make_pages_writable(__va(pa),
+							PMD_SIZE >> PAGE_SHIFT,
+							XENFEAT_writable_page_tables);
+						free_bootmem(pa, PMD_SIZE);
+					} else if (!pmd_none(*pmd)) {
+						unsigned int i;
+						pte_t *pte = pte_offset_kernel(pmd, va);
+
+						for (i = 0; i < PTRS_PER_PTE; ++i) {
+							if (pte_none(pte[i]))
+								break;
+							pa = pte_pfn(pte[i]) << PAGE_SHIFT;
+							make_page_writable(__va(pa),
+								XENFEAT_writable_page_tables);
+							free_bootmem(pa, PAGE_SIZE);
+						}
+						ClearPagePinned(virt_to_page(pte));
+						make_page_writable(pte,
+							XENFEAT_writable_page_tables);
+						free_bootmem(__pa(pte), PAGE_SIZE);
+					}
+					va += PMD_SIZE;
+					if (pmd_index(va))
+						continue;
+					ClearPagePinned(virt_to_page(pmd));
+					make_page_writable(pmd,
+						XENFEAT_writable_page_tables);
+					free_bootmem(__pa((unsigned long)pmd
+							  & PAGE_MASK),
+						     PAGE_SIZE);
+				}
+			} while (pud_index(va));
+			ClearPagePinned(virt_to_page(pud_page));
+			make_page_writable(pud_page,
+					   XENFEAT_writable_page_tables);
+			free_bootmem(__pa((unsigned long)pud_page & PAGE_MASK),
+				     PAGE_SIZE);
+		} else if (!WARN_ON(xen_start_info->mfn_list
+				    < __START_KERNEL_map))
+#endif
+			free_bootmem(__pa(xen_start_info->mfn_list),
+				     PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
+						     sizeof(unsigned long))));
 
 		if (!is_initial_xendomain() || kexec_enabled())
 			setup_pfn_to_mfn_frame_list(__alloc_bootmem);
--- head.orig/arch/x86/mm/init-xen.c	2012-04-11 17:13:04.000000000 +0200
+++ head/arch/x86/mm/init-xen.c	2012-04-11 18:02:45.000000000 +0200
@@ -352,9 +352,20 @@ unsigned long __init_refok init_memory_m
 	 * RO all the pagetable pages, including the ones that are beyond
 	 * pgt_buf_end at that time.
 	 */
-	if (!after_bootmem && pgt_buf_top > pgt_buf_start)
+	if (!after_bootmem && pgt_buf_top > pgt_buf_start) {
+#ifdef CONFIG_X86_64
+		if (xen_start_info->mfn_list < __START_KERNEL_map
+		    && pgt_buf_start <= xen_start_info->first_p2m_pfn
+		    && pgt_buf_top > xen_start_info->first_p2m_pfn) {
+			x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
+					PFN_PHYS(xen_start_info->first_p2m_pfn));
+			pgt_buf_start = xen_start_info->first_p2m_pfn
+					+ xen_start_info->nr_p2m_frames;
+		}
+#endif
 		x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
 				PFN_PHYS(pgt_buf_top));
+	}
 
 	if (!after_bootmem)
 		early_memtest(start, end);
--- head.orig/arch/x86/mm/init_64-xen.c	2012-04-11 17:55:48.000000000 +0200
+++ head/arch/x86/mm/init_64-xen.c	2012-04-11 18:02:40.000000000 +0200
@@ -220,6 +220,17 @@ void sync_global_pgds(unsigned long star
 	}
 }
 
+static __init unsigned long get_table_end(void)
+{
+	BUG_ON(!pgt_buf_end);
+	if (xen_start_info->mfn_list < __START_KERNEL_map
+	    && pgt_buf_end == xen_start_info->first_p2m_pfn) {
+		pgt_buf_end += xen_start_info->nr_p2m_frames;
+		pgt_buf_top += xen_start_info->nr_p2m_frames;
+	}
+	return pgt_buf_end++;
+}
+
 /*
  * NOTE: This function is marked __ref because it calls __init function
  * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -231,8 +242,7 @@ static __ref void *spp_getpage(void)
 	if (after_bootmem)
 		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
 	else if (pgt_buf_end < pgt_buf_top) {
-		ptr = __va(pgt_buf_end << PAGE_SHIFT);
-		pgt_buf_end++;
+		ptr = __va(get_table_end() << PAGE_SHIFT);
 		clear_page(ptr);
 	} else
 		ptr = alloc_bootmem_pages(PAGE_SIZE);
@@ -427,8 +437,7 @@ static __ref void *alloc_low_page(unsign
 		return adr;
 	}
 
-	BUG_ON(!pgt_buf_end);
-	pfn = pgt_buf_end++;
+	pfn = get_table_end();
 	if (pfn >= pgt_buf_top)
 		panic("alloc_low_page: ran out of memory");
 
@@ -469,14 +478,29 @@ static inline int __meminit make_readonl
 	/* Make new page tables read-only on the first pass. */
 	if (!xen_feature(XENFEAT_writable_page_tables)
 	    && !max_pfn_mapped
-	    && (paddr >= (pgt_buf_start << PAGE_SHIFT))
-	    && (paddr < (pgt_buf_top << PAGE_SHIFT)))
-		readonly = 1;
+	    && (paddr >= (pgt_buf_start << PAGE_SHIFT))) {
+		unsigned long top = pgt_buf_top;
+
+		/* Account for the range get_table_end() skips. */
+		if (xen_start_info->mfn_list < __START_KERNEL_map
+		    && pgt_buf_end <= xen_start_info->first_p2m_pfn
+		    && top > xen_start_info->first_p2m_pfn)
+			top += xen_start_info->nr_p2m_frames;
+		if (paddr < (top << PAGE_SHIFT))
+			readonly = 1;
+	}
 	/* Make old page tables read-only. */
 	if (!xen_feature(XENFEAT_writable_page_tables)
 	    && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
 	    && (paddr < (pgt_buf_end << PAGE_SHIFT)))
 		readonly = 1;
+	/* Make P->M table (and its page tables) read-only. */
+	if (!xen_feature(XENFEAT_writable_page_tables)
+	    && xen_start_info->mfn_list < __START_KERNEL_map
+	    && paddr >= (xen_start_info->first_p2m_pfn << PAGE_SHIFT)
+	    && paddr < (xen_start_info->first_p2m_pfn
+			+ xen_start_info->nr_p2m_frames) << PAGE_SHIFT)
+		readonly = 1;
 
 	/*
 	 * No need for writable mapping of kernel image. This also ensures that
@@ -548,7 +572,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned 
 
 	int i = pmd_index(address);
 
-	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
+	for (; i < PTRS_PER_PMD; i++, address = (address & PMD_MASK) + PMD_SIZE) {
 		unsigned long pte_phys;
 		pmd_t *pmd = pmd_page + pmd_index(address);
 		pte_t *pte;
@@ -760,6 +784,12 @@ void __init xen_init_pt(void)
 	       (PTRS_PER_PUD - pud_index(__START_KERNEL_map))
 	       * sizeof(*level3_kernel_pgt));
 
+	/* Copy the initial P->M table mappings if necessary. */
+	addr = pgd_index(xen_start_info->mfn_list);
+	if (addr < pgd_index(__START_KERNEL_map))
+		init_level4_pgt[addr] =
+			((pgd_t *)xen_start_info->pt_base)[addr];
+
 	/* Do an early initialization of the fixmap area. */
 	addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
 	if (pud_present(level3_kernel_pgt[pud_index(addr)])) {
@@ -791,22 +821,27 @@ void __init xen_init_pt(void)
 void __init xen_finish_init_mapping(void)
 {
 	unsigned long start, end;
+	struct mmuext_op mmuext;
 
 	/* Re-vector virtual addresses pointing into the initial
 	   mapping to the just-established permanent ones. */
 	xen_start_info = __va(__pa(xen_start_info));
 	xen_start_info->pt_base = (unsigned long)
 		__va(__pa(xen_start_info->pt_base));
-	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+	if (!xen_feature(XENFEAT_auto_translated_physmap)
+	    && xen_start_info->mfn_list >= __START_KERNEL_map)
 		phys_to_machine_mapping =
 			__va(__pa(xen_start_info->mfn_list));
-		xen_start_info->mfn_list = (unsigned long)
-			phys_to_machine_mapping;
-	}
 	if (xen_start_info->mod_start)
 		xen_start_info->mod_start = (unsigned long)
 			__va(__pa(xen_start_info->mod_start));
 
+	/* Unpin the no longer used Xen provided page tables. */
+	mmuext.cmd = MMUEXT_UNPIN_TABLE;
+	mmuext.arg1.mfn = virt_to_mfn(xen_start_info->pt_base);
+	if (HYPERVISOR_mmuext_op(&mmuext, 1, NULL, DOMID_SELF))
+		BUG();
+
 	/* Destroy the Xen-created mappings beyond the kernel image. */
 	start = PAGE_ALIGN(_brk_end);
 	end   = __START_KERNEL_map + (pgt_buf_start << PAGE_SHIFT);
--- head.orig/arch/x86/mm/pageattr-xen.c	2012-02-09 12:32:50.000000000 +0100
+++ head/arch/x86/mm/pageattr-xen.c	2012-02-10 14:03:23.000000000 +0100
@@ -1490,7 +1490,7 @@ static void __make_page_writable(unsigne
 
 	pte = lookup_address(va, &level);
 	BUG_ON(!pte || level != PG_LEVEL_4K);
-	if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
+	if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), UVMF_INVLPG))
 		BUG();
 	if (in_secondary_range(va)) {
 		unsigned long pfn = pte_pfn(*pte);
--- head.orig/arch/x86/mm/pgtable-xen.c	2011-04-11 16:14:31.000000000 +0200
+++ head/arch/x86/mm/pgtable-xen.c	2011-02-03 14:42:41.000000000 +0100
@@ -344,7 +344,7 @@ void __init xen_init_pgd_pin(void)
 		if (PTRS_PER_PUD > 1) /* not folded */
 			SetPagePinned(virt_to_page(pud));
 		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
-			if (!pud_present(*pud))
+			if (!pud_present(*pud) || pud_large(*pud))
 				continue;
 			pmd = pmd_offset(pud, 0);
 			if (PTRS_PER_PMD > 1) /* not folded */
@@ -355,7 +355,7 @@ void __init xen_init_pgd_pin(void)
 				    && m >= pmd_index(HYPERVISOR_VIRT_START))
 					continue;
 #endif
-				if (!pmd_present(*pmd))
+				if (!pmd_present(*pmd) || pmd_large(*pmd))
 					continue;
 				SetPagePinned(pmd_page(*pmd));
 			}
--- head.orig/arch/x86/mm/pgtable_32-xen.c	2012-04-11 13:26:23.000000000 +0200
+++ head/arch/x86/mm/pgtable_32-xen.c	2011-02-03 14:42:41.000000000 +0100
@@ -173,6 +173,6 @@ void make_lowmem_page_writable(void *va,
 	pte = lookup_address((unsigned long)va, &level);
 	BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
 	rc = HYPERVISOR_update_va_mapping(
-		(unsigned long)va, pte_mkwrite(*pte), 0);
+		(unsigned long)va, pte_mkwrite(*pte), UVMF_INVLPG);
 	BUG_ON(rc);
 }