Jan Beulich 4bfb4a
From: jbeulich@novell.com
Jan Beulich 4bfb4a
Subject: eliminate scalability issues from initial mapping setup
Jan Beulich f4eb20
Patch-mainline: n/a
Jan Beulich 4bfb4a
References: bnc#417417
Jan Beulich 4bfb4a
Jan Beulich 4bfb4a
Direct Xen to place the initial P->M table outside of the initial
Jan Beulich 4bfb4a
mapping, as otherwise the 1G (implementation) / 2G (theoretical)
Jan Beulich 4bfb4a
restriction on the size of the initial mapping limits the amount
Jan Beulich 4bfb4a
of memory a domain can be handed initially.
Jan Beulich 4bfb4a
Jan Beulich 4bfb4a
Note that the flags passed to HYPERVISOR_update_va_mapping() from
Jan Beulich 4bfb4a
__make_page_writable() and make_lowmem_page_writable() are
Jan Beulich 4bfb4a
intentionally not including UVMF_ALL. This is intended to be an optimal
Jan Beulich 4bfb4a
choice between the overhead of a potential spurious page fault (as
Jan Beulich 4bfb4a
remote CPUs may still have read-only translations in their TLBs) and
Jan Beulich 4bfb4a
the overhead of cross processor flushes. Flushing on the local CPU
Jan Beulich 4bfb4a
shouldn't be as expensive (and hence can be viewed as an optimization
Jan Beulich 4bfb4a
avoiding the spurious page fault on the local CPU), but is required
Jan Beulich 4bfb4a
when the functions are used before the page fault handler gets set up.
Jan Beulich 4bfb4a
Jan Beulich f22562
--- head.orig/arch/x86/kernel/head64-xen.c	2012-02-09 12:32:50.000000000 +0100
Jan Beulich f22562
+++ head/arch/x86/kernel/head64-xen.c	2012-02-10 14:03:06.000000000 +0100
Jan Beulich f22562
@@ -120,6 +120,12 @@ void __init x86_64_start_reservations(ch
Jan Beulich f22562
 	memblock_reserve(__pa_symbol(&_text),
Jan Beulich f22562
 			 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
Jan Beulich 4bfb4a
 
Jan Beulich 4bfb4a
+	if (xen_feature(XENFEAT_auto_translated_physmap))
Jan Beulich 4bfb4a
+		xen_start_info->mfn_list = ~0UL;
Jan Beulich 4bfb4a
+	else if (xen_start_info->mfn_list < __START_KERNEL_map)
Jan Beulich f22562
+		memblock_reserve(PFN_PHYS(xen_start_info->first_p2m_pfn),
Jan Beulich f22562
+				 PFN_PHYS(xen_start_info->nr_p2m_frames));
Jan Beulich 4bfb4a
+
Jan Beulich 4bfb4a
 	/*
Jan Beulich 4bfb4a
 	 * At this point everything still needed from the boot loader
Jan Beulich 4bfb4a
 	 * or BIOS or kernel text should be early reserved or marked not
Jan Beulich f22562
--- head.orig/arch/x86/kernel/head_64-xen.S	2011-08-09 11:17:44.000000000 +0200
Jan Beulich f22562
+++ head/arch/x86/kernel/head_64-xen.S	2011-08-09 11:19:00.000000000 +0200
Jan Beulich 48defc
@@ -17,6 +17,7 @@
Jan Beulich 48defc
 #include <linux/elfnote.h>
Jan Beulich 4bfb4a
 #include <asm/segment.h>
Jan Beulich 4bfb4a
 #include <asm/page.h>
Jan Beulich 4bfb4a
+#include <asm/pgtable.h>
Jan Beulich 4bfb4a
 #include <asm/msr.h>
Jan Beulich 4bfb4a
 #include <asm/cache.h>
Jan Beulich 4bfb4a
 #include <asm/dwarf2.h>
Jan Beulich 3692f4
@@ -159,6 +160,7 @@ ENTRY(empty_zero_page)
Jan Beulich 4bfb4a
 	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .quad startup_64)
Jan Beulich 4bfb4a
 	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page)
Jan Beulich 4bfb4a
 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,   .quad _PAGE_PRESENT, _PAGE_PRESENT)
Jan Beulich 4bfb4a
+	ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad VMEMMAP_START)
Jan Beulich 3692f4
 	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .ascii "writable_page_tables";
Jan Beulich 3692f4
 						 .ascii "|writable_descriptor_tables";
Jan Beulich 3692f4
 						 .ascii "|auto_translated_physmap";
Jan Beulich a3c5f2
--- head.orig/arch/x86/kernel/setup-xen.c	2012-05-23 13:48:26.000000000 +0200
Jan Beulich 1f9dec
+++ head/arch/x86/kernel/setup-xen.c	2012-06-08 10:49:58.000000000 +0200
Jan Beulich a3c5f2
@@ -1112,6 +1112,54 @@ void __init setup_arch(char **cmdline_p)
Jan Beulich 4c2a76
 	init_gbpages();
Jan Beulich 4c2a76
 
Jan Beulich 4c2a76
 	/* max_pfn_mapped is updated here */
Jan Beulich 4c2a76
+#ifdef CONFIG_X86_64_XEN
Jan Beulich 4c2a76
+	if (xen_start_info->mfn_list < __START_KERNEL_map) {
Jan Beulich 4c2a76
+		/* Map P2M space only after all usable memory. */
Jan Beulich 4c2a76
+		unsigned long p2m_start = xen_start_info->first_p2m_pfn;
Jan Beulich 4c2a76
+		unsigned long p2m_end = p2m_start
Jan Beulich 4c2a76
+					+ xen_start_info->nr_p2m_frames;
Jan Beulich 4c2a76
+		unsigned long temp;
Jan Beulich 4c2a76
+
Jan Beulich 4c2a76
+		max_low_pfn_mapped = init_memory_mapping(
Jan Beulich 4c2a76
+			0, min(max_low_pfn, p2m_start) << PAGE_SHIFT);
Jan Beulich 4c2a76
+		max_pfn_mapped = max_low_pfn_mapped;
Jan Beulich 4c2a76
+
Jan Beulich 4c2a76
+		if (p2m_end < max_low_pfn)
Jan Beulich 4c2a76
+			max_low_pfn_mapped = init_memory_mapping(
Jan Beulich 4c2a76
+				p2m_end << PAGE_SHIFT,
Jan Beulich 4c2a76
+				max_low_pfn << PAGE_SHIFT);
Jan Beulich 4c2a76
+		max_pfn_mapped = max_low_pfn_mapped;
Jan Beulich 4c2a76
+
Jan Beulich 4c2a76
+		if (max_low_pfn < p2m_start)
Jan Beulich 4c2a76
+			max_pfn_mapped = init_memory_mapping(
Jan Beulich 4c2a76
+				max_low_pfn << PAGE_SHIFT,
Jan Beulich 4c2a76
+				p2m_start << PAGE_SHIFT);
Jan Beulich 4c2a76
+
Jan Beulich 4c2a76
+		if (max(max_low_pfn, p2m_end) < max_pfn)
Jan Beulich 4c2a76
+			max_pfn_mapped = init_memory_mapping(
Jan Beulich 4c2a76
+				max(max_low_pfn, p2m_end) << PAGE_SHIFT,
Jan Beulich 4c2a76
+				max_pfn << PAGE_SHIFT);
Jan Beulich 4c2a76
+
Jan Beulich 4c2a76
+		temp = max_pfn_mapped;
Jan Beulich 4c2a76
+		if (p2m_start < max_low_pfn) {
Jan Beulich 4c2a76
+			temp = init_memory_mapping(
Jan Beulich 4c2a76
+				p2m_start << PAGE_SHIFT,
Jan Beulich 4c2a76
+				min(max_low_pfn, p2m_end) << PAGE_SHIFT);
Jan Beulich 4c2a76
+			if (temp > max_low_pfn_mapped)
Jan Beulich 4c2a76
+				max_low_pfn_mapped = temp;
Jan Beulich 4c2a76
+		}
Jan Beulich 4c2a76
+
Jan Beulich 4c2a76
+		if (max_low_pfn < p2m_end)
Jan Beulich 4c2a76
+			temp = init_memory_mapping(
Jan Beulich 4c2a76
+				max(max_low_pfn, p2m_start) << PAGE_SHIFT,
Jan Beulich 4c2a76
+				p2m_end << PAGE_SHIFT);
Jan Beulich 4c2a76
+		if (temp > max_pfn_mapped)
Jan Beulich 4c2a76
+			max_pfn_mapped = temp;
Jan Beulich 4c2a76
+
Jan Beulich 4c2a76
+		goto init_memory_mapping_done;
Jan Beulich 4c2a76
+	}
Jan Beulich 4c2a76
+#endif
Jan Beulich 4c2a76
+
Jan Beulich 4c2a76
 	max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
Jan Beulich 4c2a76
 	max_pfn_mapped = max_low_pfn_mapped;
Jan Beulich 4c2a76
 
Jan Beulich a3c5f2
@@ -1119,6 +1167,7 @@ void __init setup_arch(char **cmdline_p)
Jan Beulich 4c2a76
 	if (max_pfn > max_low_pfn) {
Jan Beulich 4c2a76
 		max_pfn_mapped = init_memory_mapping(1UL<<32,
Jan Beulich 4c2a76
 						     max_pfn<<PAGE_SHIFT);
Jan Beulich 4c2a76
+ init_memory_mapping_done:
Jan Beulich 4c2a76
 		/* can we preseve max_low_pfn ?*/
Jan Beulich 4c2a76
 		max_low_pfn = max_pfn;
Jan Beulich 4c2a76
 	}
Jan Beulich 1f9dec
@@ -1211,7 +1260,7 @@ void __init setup_arch(char **cmdline_p)
Jan Beulich 4bfb4a
 		difference = xen_start_info->nr_pages - max_pfn;
Jan Beulich 4bfb4a
 
Jan Beulich 4bfb4a
 		set_xen_guest_handle(reservation.extent_start,
Jan Beulich 4bfb4a
-				     ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
Jan Beulich 4bfb4a
+				     phys_to_machine_mapping + max_pfn);
Jan Beulich 4bfb4a
 		reservation.nr_extents = difference;
Jan Beulich 4bfb4a
 		ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
Jan Beulich 4bfb4a
 					   &reservation);
Jan Beulich 1f9dec
@@ -1224,13 +1273,83 @@ void __init setup_arch(char **cmdline_p)
Jan Beulich 9f943f
 		phys_to_machine_mapping = alloc_bootmem_pages(
Jan Beulich 9f943f
 			max_pfn * sizeof(unsigned long));
Jan Beulich 9f943f
 		memcpy(phys_to_machine_mapping,
Jan Beulich 9f943f
-		       (unsigned long *)xen_start_info->mfn_list,
Jan Beulich 9f943f
+		       __va(__pa(xen_start_info->mfn_list)),
Jan Beulich 9f943f
 		       p2m_pages * sizeof(unsigned long));
Jan Beulich f22562
 		memset(phys_to_machine_mapping + p2m_pages, ~0,
Jan Beulich f22562
 		       (max_pfn - p2m_pages) * sizeof(unsigned long));
Jan Beulich f22562
-		free_bootmem(__pa(xen_start_info->mfn_list),
Jan Beulich f22562
-			     PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
Jan Beulich f22562
-					     sizeof(unsigned long))));
Jan Beulich f22562
+#ifdef CONFIG_X86_64
Jan Beulich 4bfb4a
+		if (xen_start_info->mfn_list == VMEMMAP_START) {
Jan Beulich 4bfb4a
+			/*
Jan Beulich 4bfb4a
+			 * Since it is well isolated we can (and since it is
Jan Beulich 4bfb4a
+			 * perhaps large we should) also free the page tables
Jan Beulich 4bfb4a
+			 * mapping the initial P->M table.
Jan Beulich 4bfb4a
+			 */
Jan Beulich 4bfb4a
+			unsigned long va = VMEMMAP_START, pa;
Jan Beulich 4bfb4a
+			pgd_t *pgd = pgd_offset_k(va);
Jan Beulich 4bfb4a
+			pud_t *pud_page = pud_offset(pgd, 0);
Jan Beulich 4bfb4a
+
Jan Beulich 4bfb4a
+			BUILD_BUG_ON(VMEMMAP_START & ~PGDIR_MASK);
Jan Beulich 4bfb4a
+			xen_l4_entry_update(pgd, __pgd(0));
Jan Beulich 9f943f
+			do {
Jan Beulich 4bfb4a
+				pud_t *pud = pud_page + pud_index(va);
Jan Beulich 4bfb4a
+
Jan Beulich 4bfb4a
+				if (pud_none(*pud))
Jan Beulich 4bfb4a
+					va += PUD_SIZE;
Jan Beulich 4bfb4a
+				else if (pud_large(*pud)) {
Jan Beulich 4bfb4a
+					pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
Jan Beulich 4bfb4a
+					make_pages_writable(__va(pa),
Jan Beulich 4bfb4a
+						PUD_SIZE >> PAGE_SHIFT,
Jan Beulich 4bfb4a
+						XENFEAT_writable_page_tables);
Jan Beulich 4bfb4a
+					free_bootmem(pa, PUD_SIZE);
Jan Beulich 4bfb4a
+					va += PUD_SIZE;
Jan Beulich 4bfb4a
+				} else {
Jan Beulich 4bfb4a
+					pmd_t *pmd = pmd_offset(pud, va);
Jan Beulich 4bfb4a
+
Jan Beulich 4bfb4a
+					if (pmd_large(*pmd)) {
Jan Beulich 4bfb4a
+						pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
Jan Beulich 4bfb4a
+						make_pages_writable(__va(pa),
Jan Beulich 4bfb4a
+							PMD_SIZE >> PAGE_SHIFT,
Jan Beulich 4bfb4a
+							XENFEAT_writable_page_tables);
Jan Beulich 4bfb4a
+						free_bootmem(pa, PMD_SIZE);
Jan Beulich 4bfb4a
+					} else if (!pmd_none(*pmd)) {
Jan Beulich b08ea4
+						unsigned int i;
Jan Beulich 4bfb4a
+						pte_t *pte = pte_offset_kernel(pmd, va);
Jan Beulich 4bfb4a
+
Jan Beulich 4bfb4a
+						for (i = 0; i < PTRS_PER_PTE; ++i) {
Jan Beulich 4bfb4a
+							if (pte_none(pte[i]))
Jan Beulich 4bfb4a
+								break;
Jan Beulich 4bfb4a
+							pa = pte_pfn(pte[i]) << PAGE_SHIFT;
Jan Beulich 4bfb4a
+							make_page_writable(__va(pa),
Jan Beulich 4bfb4a
+								XENFEAT_writable_page_tables);
Jan Beulich 4bfb4a
+							free_bootmem(pa, PAGE_SIZE);
Jan Beulich 4bfb4a
+						}
Jan Beulich 4bfb4a
+						ClearPagePinned(virt_to_page(pte));
Jan Beulich 4bfb4a
+						make_page_writable(pte,
Jan Beulich 4bfb4a
+							XENFEAT_writable_page_tables);
Jan Beulich 4bfb4a
+						free_bootmem(__pa(pte), PAGE_SIZE);
Jan Beulich 4bfb4a
+					}
Jan Beulich 4bfb4a
+					va += PMD_SIZE;
Jan Beulich 4bfb4a
+					if (pmd_index(va))
Jan Beulich 4bfb4a
+						continue;
Jan Beulich 4bfb4a
+					ClearPagePinned(virt_to_page(pmd));
Jan Beulich 4bfb4a
+					make_page_writable(pmd,
Jan Beulich 4bfb4a
+						XENFEAT_writable_page_tables);
Jan Beulich 4bfb4a
+					free_bootmem(__pa((unsigned long)pmd
Jan Beulich 4bfb4a
+							  & PAGE_MASK),
Jan Beulich 9f943f
+						     PAGE_SIZE);
Jan Beulich 4bfb4a
+				}
Jan Beulich 9f943f
+			} while (pud_index(va));
Jan Beulich 4bfb4a
+			ClearPagePinned(virt_to_page(pud_page));
Jan Beulich 4bfb4a
+			make_page_writable(pud_page,
Jan Beulich 9f943f
+					   XENFEAT_writable_page_tables);
Jan Beulich 4bfb4a
+			free_bootmem(__pa((unsigned long)pud_page & PAGE_MASK),
Jan Beulich 9f943f
+				     PAGE_SIZE);
Jan Beulich 4bfb4a
+		} else if (!WARN_ON(xen_start_info->mfn_list
Jan Beulich 4bfb4a
+				    < __START_KERNEL_map))
Jan Beulich 4bfb4a
+#endif
Jan Beulich 4bfb4a
+			free_bootmem(__pa(xen_start_info->mfn_list),
Jan Beulich f22562
+				     PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
Jan Beulich f22562
+						     sizeof(unsigned long))));
Jan Beulich 4bfb4a
 
Jan Beulich 8ed7ef
 		if (!is_initial_xendomain() || kexec_enabled())
Jan Beulich 8ed7ef
 			setup_pfn_to_mfn_frame_list(__alloc_bootmem);
Jan Beulich 9f943f
--- head.orig/arch/x86/mm/init-xen.c	2012-04-11 17:13:04.000000000 +0200
Jan Beulich 9f943f
+++ head/arch/x86/mm/init-xen.c	2012-04-11 18:02:45.000000000 +0200
Jan Beulich 9f943f
@@ -352,9 +352,20 @@ unsigned long __init_refok init_memory_m
Jan Beulich 3b6edf
 	 * RO all the pagetable pages, including the ones that are beyond
Jan Beulich 3b6edf
 	 * pgt_buf_end at that time.
Jan Beulich 3b6edf
 	 */
Jan Beulich d3bfd6
-	if (!after_bootmem && pgt_buf_top > pgt_buf_start)
Jan Beulich d3bfd6
+	if (!after_bootmem && pgt_buf_top > pgt_buf_start) {
Jan Beulich 9d5ae8
+#ifdef CONFIG_X86_64
Jan Beulich 9d5ae8
+		if (xen_start_info->mfn_list < __START_KERNEL_map
Jan Beulich d3bfd6
+		    && pgt_buf_start <= xen_start_info->first_p2m_pfn
Jan Beulich d3bfd6
+		    && pgt_buf_top > xen_start_info->first_p2m_pfn) {
Jan Beulich 3b6edf
+			x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
Jan Beulich 3b6edf
+					PFN_PHYS(xen_start_info->first_p2m_pfn));
Jan Beulich d3bfd6
+			pgt_buf_start = xen_start_info->first_p2m_pfn
Jan Beulich d3bfd6
+					+ xen_start_info->nr_p2m_frames;
Jan Beulich 9d5ae8
+		}
Jan Beulich 9d5ae8
+#endif
Jan Beulich 3b6edf
 		x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
Jan Beulich 3b6edf
 				PFN_PHYS(pgt_buf_top));
Jan Beulich 9d5ae8
+	}
Jan Beulich 9d5ae8
 
Jan Beulich 9d5ae8
 	if (!after_bootmem)
Jan Beulich 9d5ae8
 		early_memtest(start, end);
Jan Beulich 9f943f
--- head.orig/arch/x86/mm/init_64-xen.c	2012-04-11 17:55:48.000000000 +0200
Jan Beulich 9f943f
+++ head/arch/x86/mm/init_64-xen.c	2012-04-11 18:02:40.000000000 +0200
Jan Beulich 9f943f
@@ -220,6 +220,17 @@ void sync_global_pgds(unsigned long star
Jan Beulich cf6d99
 	}
Jan Beulich 9d5ae8
 }
Jan Beulich 9d5ae8
 
Jan Beulich 9d5ae8
+static __init unsigned long get_table_end(void)
Jan Beulich 4bfb4a
+{
Jan Beulich d3bfd6
+	BUG_ON(!pgt_buf_end);
Jan Beulich 4bfb4a
+	if (xen_start_info->mfn_list < __START_KERNEL_map
Jan Beulich d3bfd6
+	    && pgt_buf_end == xen_start_info->first_p2m_pfn) {
Jan Beulich d3bfd6
+		pgt_buf_end += xen_start_info->nr_p2m_frames;
Jan Beulich d3bfd6
+		pgt_buf_top += xen_start_info->nr_p2m_frames;
Jan Beulich 4bfb4a
+	}
Jan Beulich d3bfd6
+	return pgt_buf_end++;
Jan Beulich 4bfb4a
+}
Jan Beulich 4bfb4a
+
Jan Beulich 4bfb4a
 /*
Jan Beulich 4bfb4a
  * NOTE: This function is marked __ref because it calls __init function
Jan Beulich 4bfb4a
  * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
Jan Beulich 9f943f
@@ -231,8 +242,7 @@ static __ref void *spp_getpage(void)
Jan Beulich 4bfb4a
 	if (after_bootmem)
Jan Beulich 48defc
 		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
Jan Beulich d3bfd6
 	else if (pgt_buf_end < pgt_buf_top) {
Jan Beulich d3bfd6
-		ptr = __va(pgt_buf_end << PAGE_SHIFT);
Jan Beulich d3bfd6
-		pgt_buf_end++;
Jan Beulich 9d5ae8
+		ptr = __va(get_table_end() << PAGE_SHIFT);
Jan Beulich 683661
 		clear_page(ptr);
Jan Beulich 4bfb4a
 	} else
Jan Beulich 4bfb4a
 		ptr = alloc_bootmem_pages(PAGE_SIZE);
Jan Beulich 9f943f
@@ -427,8 +437,7 @@ static __ref void *alloc_low_page(unsign
Jan Beulich 4bfb4a
 		return adr;
Jan Beulich 4bfb4a
 	}
Jan Beulich 4bfb4a
 
Jan Beulich d3bfd6
-	BUG_ON(!pgt_buf_end);
Jan Beulich d3bfd6
-	pfn = pgt_buf_end++;
Jan Beulich 9d5ae8
+	pfn = get_table_end();
Jan Beulich d3bfd6
 	if (pfn >= pgt_buf_top)
Jan Beulich 4bfb4a
 		panic("alloc_low_page: ran out of memory");
Jan Beulich 4bfb4a
 
Jan Beulich 9f943f
@@ -469,14 +478,29 @@ static inline int __meminit make_readonl
Jan Beulich 4bfb4a
 	/* Make new page tables read-only on the first pass. */
Jan Beulich 4bfb4a
 	if (!xen_feature(XENFEAT_writable_page_tables)
Jan Beulich 4bfb4a
 	    && !max_pfn_mapped
Jan Beulich d3bfd6
-	    && (paddr >= (pgt_buf_start << PAGE_SHIFT))
Jan Beulich d3bfd6
-	    && (paddr < (pgt_buf_top << PAGE_SHIFT)))
Jan Beulich 4bfb4a
-		readonly = 1;
Jan Beulich d3bfd6
+	    && (paddr >= (pgt_buf_start << PAGE_SHIFT))) {
Jan Beulich d3bfd6
+		unsigned long top = pgt_buf_top;
Jan Beulich 4bfb4a
+
Jan Beulich 9d5ae8
+		/* Account for the range get_table_end() skips. */
Jan Beulich 4bfb4a
+		if (xen_start_info->mfn_list < __START_KERNEL_map
Jan Beulich d3bfd6
+		    && pgt_buf_end <= xen_start_info->first_p2m_pfn
Jan Beulich 4bfb4a
+		    && top > xen_start_info->first_p2m_pfn)
Jan Beulich 4bfb4a
+			top += xen_start_info->nr_p2m_frames;
Jan Beulich 4bfb4a
+		if (paddr < (top << PAGE_SHIFT))
Jan Beulich 4bfb4a
+			readonly = 1;
Jan Beulich 4bfb4a
+	}
Jan Beulich 4bfb4a
 	/* Make old page tables read-only. */
Jan Beulich 4bfb4a
 	if (!xen_feature(XENFEAT_writable_page_tables)
Jan Beulich 4bfb4a
 	    && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
Jan Beulich d3bfd6
 	    && (paddr < (pgt_buf_end << PAGE_SHIFT)))
Jan Beulich 4bfb4a
 		readonly = 1;
Jan Beulich 4bfb4a
+	/* Make P->M table (and its page tables) read-only. */
Jan Beulich 4bfb4a
+	if (!xen_feature(XENFEAT_writable_page_tables)
Jan Beulich 4bfb4a
+	    && xen_start_info->mfn_list < __START_KERNEL_map
Jan Beulich 4bfb4a
+	    && paddr >= (xen_start_info->first_p2m_pfn << PAGE_SHIFT)
Jan Beulich 4bfb4a
+	    && paddr < (xen_start_info->first_p2m_pfn
Jan Beulich 4bfb4a
+			+ xen_start_info->nr_p2m_frames) << PAGE_SHIFT)
Jan Beulich 4bfb4a
+		readonly = 1;
Jan Beulich 4bfb4a
 
Jan Beulich 4bfb4a
 	/*
Jan Beulich 4bfb4a
 	 * No need for writable mapping of kernel image. This also ensures that
Jan Beulich 9f943f
@@ -548,7 +572,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned 
Jan Beulich 4c2a76
 
Jan Beulich 4c2a76
 	int i = pmd_index(address);
Jan Beulich 4c2a76
 
Jan Beulich 4c2a76
-	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
Jan Beulich 4c2a76
+	for (; i < PTRS_PER_PMD; i++, address = (address & PMD_MASK) + PMD_SIZE) {
Jan Beulich 4c2a76
 		unsigned long pte_phys;
Jan Beulich 4c2a76
 		pmd_t *pmd = pmd_page + pmd_index(address);
Jan Beulich 4c2a76
 		pte_t *pte;
Jan Beulich 9f943f
@@ -760,6 +784,12 @@ void __init xen_init_pt(void)
Jan Beulich 2396a9
 	       (PTRS_PER_PUD - pud_index(__START_KERNEL_map))
Jan Beulich 2396a9
 	       * sizeof(*level3_kernel_pgt));
Jan Beulich 4bfb4a
 
Jan Beulich 4bfb4a
+	/* Copy the initial P->M table mappings if necessary. */
Jan Beulich 4bfb4a
+	addr = pgd_index(xen_start_info->mfn_list);
Jan Beulich 4bfb4a
+	if (addr < pgd_index(__START_KERNEL_map))
Jan Beulich 4bfb4a
+		init_level4_pgt[addr] =
Jan Beulich 4bfb4a
+			((pgd_t *)xen_start_info->pt_base)[addr];
Jan Beulich 4bfb4a
+
Jan Beulich 4bfb4a
 	/* Do an early initialization of the fixmap area. */
Jan Beulich 4bfb4a
 	addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
Jan Beulich 2396a9
 	if (pud_present(level3_kernel_pgt[pud_index(addr)])) {
Jan Beulich 9f943f
@@ -791,22 +821,27 @@ void __init xen_init_pt(void)
Jan Beulich 9d5ae8
 void __init xen_finish_init_mapping(void)
Jan Beulich 4bfb4a
 {
Jan Beulich 9d5ae8
 	unsigned long start, end;
Jan Beulich 4bfb4a
+	struct mmuext_op mmuext;
Jan Beulich 4bfb4a
 
Jan Beulich 4bfb4a
 	/* Re-vector virtual addresses pointing into the initial
Jan Beulich 4bfb4a
 	   mapping to the just-established permanent ones. */
Jan Beulich 4bfb4a
 	xen_start_info = __va(__pa(xen_start_info));
Jan Beulich 4bfb4a
 	xen_start_info->pt_base = (unsigned long)
Jan Beulich 4bfb4a
 		__va(__pa(xen_start_info->pt_base));
Jan Beulich 4bfb4a
-	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
Jan Beulich 4bfb4a
+	if (!xen_feature(XENFEAT_auto_translated_physmap)
Jan Beulich 4bfb4a
+	    && xen_start_info->mfn_list >= __START_KERNEL_map)
Jan Beulich 4bfb4a
 		phys_to_machine_mapping =
Jan Beulich 4bfb4a
 			__va(__pa(xen_start_info->mfn_list));
Jan Beulich 4bfb4a
-		xen_start_info->mfn_list = (unsigned long)
Jan Beulich 4bfb4a
-			phys_to_machine_mapping;
Jan Beulich 4bfb4a
-	}
Jan Beulich 4bfb4a
 	if (xen_start_info->mod_start)
Jan Beulich 4bfb4a
 		xen_start_info->mod_start = (unsigned long)
Jan Beulich 4bfb4a
 			__va(__pa(xen_start_info->mod_start));
Jan Beulich 4bfb4a
 
Jan Beulich 4bfb4a
+	/* Unpin the no longer used Xen provided page tables. */
Jan Beulich 4bfb4a
+	mmuext.cmd = MMUEXT_UNPIN_TABLE;
Jan Beulich ab11d6
+	mmuext.arg1.mfn = virt_to_mfn(xen_start_info->pt_base);
Jan Beulich 4bfb4a
+	if (HYPERVISOR_mmuext_op(&mmuext, 1, NULL, DOMID_SELF))
Jan Beulich 4bfb4a
+		BUG();
Jan Beulich 4bfb4a
+
Jan Beulich 4bfb4a
 	/* Destroy the Xen-created mappings beyond the kernel image. */
Jan Beulich fdf2d6
 	start = PAGE_ALIGN(_brk_end);
Jan Beulich d3bfd6
 	end   = __START_KERNEL_map + (pgt_buf_start << PAGE_SHIFT);
Jan Beulich f22562
--- head.orig/arch/x86/mm/pageattr-xen.c	2012-02-09 12:32:50.000000000 +0100
Jan Beulich f22562
+++ head/arch/x86/mm/pageattr-xen.c	2012-02-10 14:03:23.000000000 +0100
Jan Beulich f22562
@@ -1490,7 +1490,7 @@ static void __make_page_writable(unsigne
Jan Beulich 4bfb4a
 
Jan Beulich 4bfb4a
 	pte = lookup_address(va, &level);
Jan Beulich 4bfb4a
 	BUG_ON(!pte || level != PG_LEVEL_4K);
Jan Beulich 4bfb4a
-	if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
Jan Beulich 4bfb4a
+	if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), UVMF_INVLPG))
Jan Beulich 4bfb4a
 		BUG();
Jan Beulich 4bfb4a
 	if (in_secondary_range(va)) {
Jan Beulich 4bfb4a
 		unsigned long pfn = pte_pfn(*pte);
Jan Beulich f22562
--- head.orig/arch/x86/mm/pgtable-xen.c	2011-04-11 16:14:31.000000000 +0200
Jan Beulich f22562
+++ head/arch/x86/mm/pgtable-xen.c	2011-02-03 14:42:41.000000000 +0100
Jan Beulich 30e8a1
@@ -344,7 +344,7 @@ void __init xen_init_pgd_pin(void)
Jan Beulich 4bfb4a
 		if (PTRS_PER_PUD > 1) /* not folded */
Jan Beulich 4bfb4a
 			SetPagePinned(virt_to_page(pud));
Jan Beulich 4bfb4a
 		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
Jan Beulich 4bfb4a
-			if (!pud_present(*pud))
Jan Beulich 4bfb4a
+			if (!pud_present(*pud) || pud_large(*pud))
Jan Beulich 4bfb4a
 				continue;
Jan Beulich 4bfb4a
 			pmd = pmd_offset(pud, 0);
Jan Beulich 4bfb4a
 			if (PTRS_PER_PMD > 1) /* not folded */
Jan Beulich 30e8a1
@@ -355,7 +355,7 @@ void __init xen_init_pgd_pin(void)
Jan Beulich 4bfb4a
 				    && m >= pmd_index(HYPERVISOR_VIRT_START))
Jan Beulich 4bfb4a
 					continue;
Jan Beulich 4bfb4a
 #endif
Jan Beulich 4bfb4a
-				if (!pmd_present(*pmd))
Jan Beulich 4bfb4a
+				if (!pmd_present(*pmd) || pmd_large(*pmd))
Jan Beulich 4bfb4a
 					continue;
Jan Beulich 4bfb4a
 				SetPagePinned(pmd_page(*pmd));
Jan Beulich 4bfb4a
 			}
Jan Beulich 9f943f
--- head.orig/arch/x86/mm/pgtable_32-xen.c	2012-04-11 13:26:23.000000000 +0200
Jan Beulich f22562
+++ head/arch/x86/mm/pgtable_32-xen.c	2011-02-03 14:42:41.000000000 +0100
Jan Beulich 9f943f
@@ -173,6 +173,6 @@ void make_lowmem_page_writable(void *va,
Jan Beulich 4bfb4a
 	pte = lookup_address((unsigned long)va, &level);
Jan Beulich 4bfb4a
 	BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
Jan Beulich 4bfb4a
 	rc = HYPERVISOR_update_va_mapping(
Jan Beulich 4bfb4a
-		(unsigned long)va, pte_mkwrite(*pte), 0);
Jan Beulich 4bfb4a
+		(unsigned long)va, pte_mkwrite(*pte), UVMF_INVLPG);
Jan Beulich 4bfb4a
 	BUG_ON(rc);
Jan Beulich 4bfb4a
 }