Blob Blame History Raw
From: jbeulich@novell.com
Subject: eliminate scalability issues from initial mapping setup
Patch-mainline: n/a
References: bnc#417417

Direct Xen to place the initial P->M table outside of the initial
mapping, as otherwise the 1G (implementation) / 2G (theoretical)
restriction on the size of the initial mapping limits the amount
of memory a domain can be handed initially.

Note that the flags passed to HYPERVISOR_update_va_mapping() from
__make_page_writable() and make_lowmem_page_writable() are
intentionally not including UVMF_ALL. This is intended to be on optimal
choice between the overhead of a potential spurious page fault (as
remote CPUs may still have read-only translations in their TLBs) and
the overhead of cross processor flushes. Flushing on the local CPU
shouldn't be as expensive (and hence can be viewed as an optimization
avoiding the spurious page fault on the local CPU), but is required
when the functions are used before the page fault handler gets set up.

--- head.orig/arch/x86/kernel/head64-xen.c	2014-06-27 10:54:54.000000000 +0200
+++ head/arch/x86/kernel/head64-xen.c	2013-05-10 14:51:39.000000000 +0200
@@ -209,5 +209,11 @@ void __init x86_64_start_reservations(ch
 {
 	copy_bootdata(__va(real_mode_data));
 
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		xen_start_info->mfn_list = ~0UL;
+	else if (xen_start_info->mfn_list < __START_KERNEL_map)
+		memblock_reserve(PFN_PHYS(xen_start_info->first_p2m_pfn),
+				 PFN_PHYS(xen_start_info->nr_p2m_frames));
+
 	start_kernel();
 }
--- head.orig/arch/x86/kernel/head_64-xen.S	2013-02-20 11:49:23.000000000 +0100
+++ head/arch/x86/kernel/head_64-xen.S	2013-03-27 10:43:51.000000000 +0100
@@ -17,6 +17,7 @@
 #include <linux/elfnote.h>
 #include <asm/segment.h>
 #include <asm/page.h>
+#include <asm/pgtable.h>
 #include <asm/msr.h>
 #include <asm/cache.h>
 #include <asm/dwarf2.h>
@@ -188,6 +189,7 @@ NEXT_PAGE(empty_zero_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .quad startup_64)
 	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,   .quad _PAGE_PRESENT, _PAGE_PRESENT)
+	ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad VMEMMAP_START)
 	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .ascii "writable_page_tables";
 						 .ascii "|writable_descriptor_tables";
 						 .ascii "|auto_translated_physmap";
--- head.orig/arch/x86/kernel/setup-xen.c	2013-12-11 11:25:16.000000000 +0100
+++ head/arch/x86/kernel/setup-xen.c	2015-01-16 12:21:39.000000000 +0100
@@ -1427,7 +1427,7 @@ void __init setup_arch(char **cmdline_p)
 		difference = xen_start_info->nr_pages - max_pfn;
 
 		set_xen_guest_handle(reservation.extent_start,
-				     ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
+				     phys_to_machine_mapping + max_pfn);
 		reservation.nr_extents = difference;
 		ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
 					   &reservation);
@@ -1440,13 +1440,83 @@ void __init setup_arch(char **cmdline_p)
 		phys_to_machine_mapping = alloc_bootmem_pages(
 			max_pfn * sizeof(unsigned long));
 		memcpy(phys_to_machine_mapping,
-		       (unsigned long *)xen_start_info->mfn_list,
+		       __va(__pa(xen_start_info->mfn_list)),
 		       p2m_pages * sizeof(unsigned long));
 		memset(phys_to_machine_mapping + p2m_pages, ~0,
 		       (max_pfn - p2m_pages) * sizeof(unsigned long));
-		free_bootmem(__pa(xen_start_info->mfn_list),
-			     PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
-					     sizeof(unsigned long))));
+#ifdef CONFIG_X86_64
+		if (xen_start_info->mfn_list == VMEMMAP_START) {
+			/*
+			 * Since it is well isolated we can (and since it is
+			 * perhaps large we should) also free the page tables
+			 * mapping the initial P->M table.
+			 */
+			unsigned long va = VMEMMAP_START, pa;
+			pgd_t *pgd = pgd_offset_k(va);
+			pud_t *pud_page = pud_offset(pgd, 0);
+
+			BUILD_BUG_ON(VMEMMAP_START & ~PGDIR_MASK);
+			xen_l4_entry_update(pgd, __pgd(0));
+			do {
+				pud_t *pud = pud_page + pud_index(va);
+
+				if (pud_none(*pud))
+					va += PUD_SIZE;
+				else if (pud_large(*pud)) {
+					pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+					make_pages_writable(__va(pa),
+						PUD_SIZE >> PAGE_SHIFT,
+						XENFEAT_writable_page_tables);
+					free_bootmem(pa, PUD_SIZE);
+					va += PUD_SIZE;
+				} else {
+					pmd_t *pmd = pmd_offset(pud, va);
+
+					if (pmd_large(*pmd)) {
+						pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+						make_pages_writable(__va(pa),
+							PMD_SIZE >> PAGE_SHIFT,
+							XENFEAT_writable_page_tables);
+						free_bootmem(pa, PMD_SIZE);
+					} else if (!pmd_none(*pmd)) {
+						unsigned int i;
+						pte_t *pte = pte_offset_kernel(pmd, va);
+
+						for (i = 0; i < PTRS_PER_PTE; ++i) {
+							if (pte_none(pte[i]))
+								break;
+							pa = pte_pfn(pte[i]) << PAGE_SHIFT;
+							make_page_writable(__va(pa),
+								XENFEAT_writable_page_tables);
+							free_bootmem(pa, PAGE_SIZE);
+						}
+						ClearPagePinned(virt_to_page(pte));
+						make_page_writable(pte,
+							XENFEAT_writable_page_tables);
+						free_bootmem(__pa(pte), PAGE_SIZE);
+					}
+					va += PMD_SIZE;
+					if (pmd_index(va))
+						continue;
+					ClearPagePinned(virt_to_page(pmd));
+					make_page_writable(pmd,
+						XENFEAT_writable_page_tables);
+					free_bootmem(__pa((unsigned long)pmd
+							  & PAGE_MASK),
+						     PAGE_SIZE);
+				}
+			} while (pud_index(va) | pmd_index(va));
+			ClearPagePinned(virt_to_page(pud_page));
+			make_page_writable(pud_page,
+					   XENFEAT_writable_page_tables);
+			free_bootmem(__pa((unsigned long)pud_page & PAGE_MASK),
+				     PAGE_SIZE);
+		} else if (!WARN_ON(xen_start_info->mfn_list
+				    < __START_KERNEL_map))
+#endif
+			free_bootmem(__pa_symbol(xen_start_info->mfn_list),
+				     PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
+						     sizeof(unsigned long))));
 
 		if (!is_initial_xendomain() || kexec_enabled())
 			setup_pfn_to_mfn_frame_list(__alloc_bootmem);
--- head.orig/arch/x86/mm/init_64-xen.c	2015-01-14 15:16:29.000000000 +0100
+++ head/arch/x86/mm/init_64-xen.c	2015-01-21 14:48:35.000000000 +0100
@@ -524,6 +524,13 @@ static inline int __meminit make_readonl
 			 PFN_PHYS(xen_start_info->nr_pt_frames) -
 			 __START_KERNEL_map)))
 		readonly = 1;
+	/* Make P->M table (and its page tables) read-only. */
+	if (!xen_feature(XENFEAT_writable_page_tables)
+	    && xen_start_info->mfn_list < __START_KERNEL_map
+	    && paddr >= (xen_start_info->first_p2m_pfn << PAGE_SHIFT)
+	    && paddr < (xen_start_info->first_p2m_pfn
+			+ xen_start_info->nr_p2m_frames) << PAGE_SHIFT)
+		readonly = 1;
 
 	/*
 	 * No need for writable mapping of kernel image. This also ensures that
@@ -815,6 +822,12 @@ void __init xen_init_pt(void)
 	       (PTRS_PER_PUD - pud_index(__START_KERNEL_map))
 	       * sizeof(*level3_kernel_pgt));
 
+	/* Copy the initial P->M table mappings if necessary. */
+	addr = pgd_index(xen_start_info->mfn_list);
+	if (addr < pgd_index(__START_KERNEL_map))
+		init_level4_pgt[addr] =
+			((pgd_t *)xen_start_info->pt_base)[addr];
+
 	/* Do an early initialization of the fixmap area. */
 	addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
 	if (pud_present(level3_kernel_pgt[pud_index(addr)])) {
@@ -902,6 +915,7 @@ void __init xen_init_pt(void)
 void __init xen_finish_init_mapping(void)
 {
 	unsigned long va;
+	struct mmuext_op mmuext;
 	pud_t *pud;
 
 	/* Re-vector virtual addresses pointing into the initial
@@ -909,16 +923,20 @@ void __init xen_finish_init_mapping(void
 	xen_start_info = __va(__pa(xen_start_info));
 	xen_start_info->pt_base = (unsigned long)
 		__va(__pa(xen_start_info->pt_base));
-	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+	if (!xen_feature(XENFEAT_auto_translated_physmap)
+	    && xen_start_info->mfn_list >= __START_KERNEL_map)
 		phys_to_machine_mapping =
-			__va(__pa(xen_start_info->mfn_list));
-		xen_start_info->mfn_list = (unsigned long)
-			phys_to_machine_mapping;
-	}
+			__va(__pa_symbol(xen_start_info->mfn_list));
 	if (xen_start_info->mod_start)
 		xen_start_info->mod_start = (unsigned long)
 			__va(__pa(xen_start_info->mod_start));
 
+	/* Unpin the no longer used Xen provided page tables. */
+	mmuext.cmd = MMUEXT_UNPIN_TABLE;
+	mmuext.arg1.mfn = virt_to_mfn(xen_start_info->pt_base);
+	if (HYPERVISOR_mmuext_op(&mmuext, 1, NULL, DOMID_SELF))
+		BUG();
+
 	/* Kill mapping of memory below _text. */
 	va = __START_KERNEL_map;
 	while (va < (unsigned long)&_text) {
--- head.orig/arch/x86/mm/pageattr-xen.c	2015-01-13 16:18:07.000000000 +0100
+++ head/arch/x86/mm/pageattr-xen.c	2013-05-27 14:15:46.000000000 +0200
@@ -2049,7 +2049,7 @@ static void __make_page_writable(unsigne
 
 	pte = lookup_address(va, &level);
 	BUG_ON(!pte || level != PG_LEVEL_4K);
-	if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
+	if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), UVMF_INVLPG))
 		BUG();
 	if (in_secondary_range(va)) {
 		unsigned long pfn = pte_pfn(*pte);
--- head.orig/arch/x86/mm/pgtable-xen.c	2013-12-11 11:58:07.000000000 +0100
+++ head/arch/x86/mm/pgtable-xen.c	2013-12-11 11:58:27.000000000 +0100
@@ -362,7 +362,7 @@ void __init xen_init_pgd_pin(void)
 		if (PTRS_PER_PUD > 1) /* not folded */
 			SetPagePinned(virt_to_page(pud));
 		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
-			if (!pud_present(*pud))
+			if (!pud_present(*pud) || pud_large(*pud))
 				continue;
 			pmd = pmd_offset(pud, 0);
 			if (PTRS_PER_PMD > 1) /* not folded */
@@ -373,7 +373,7 @@ void __init xen_init_pgd_pin(void)
 				    && m >= pmd_index(HYPERVISOR_VIRT_START))
 					continue;
 #endif
-				if (!pmd_present(*pmd))
+				if (!pmd_present(*pmd) || pmd_large(*pmd))
 					continue;
 				SetPagePinned(pmd_page(*pmd));
 			}
--- head.orig/arch/x86/mm/pgtable_32-xen.c	2014-10-31 13:43:42.000000000 +0100
+++ head/arch/x86/mm/pgtable_32-xen.c	2011-02-03 14:42:41.000000000 +0100
@@ -138,6 +138,6 @@ void make_lowmem_page_writable(void *va,
 	pte = lookup_address((unsigned long)va, &level);
 	BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
 	rc = HYPERVISOR_update_va_mapping(
-		(unsigned long)va, pte_mkwrite(*pte), 0);
+		(unsigned long)va, pte_mkwrite(*pte), UVMF_INVLPG);
 	BUG_ON(rc);
 }