From: jbeulich@novell.com
Subject: don't require order-1 allocations for pgd-s
Patch-mainline: n/a

Instead of requiring an order-1 allocation (a pair of contiguous pages
holding the kernel and user halves) for each PGD, allocate the two pages
independently and link the user page to the kernel one through the
latter's struct page ->private field. __user_pgd() becomes a real
function following that link; it returns NULL for init_level4_pgt, whose
useless user mode companion page gets removed at the same time, with the
pinning code falling back to pinning level3_user_pgt as an L3 table in
that case.
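
For illustration only, the new allocation scheme can be modelled in
plain user-space C roughly as below (calloc() stands in for
__get_free_page(), and an explicit struct stands in for the struct page
->private link; all names in the sketch are made up):

	#include <stdint.h>
	#include <stdlib.h>

	#define PAGE_SIZE 4096UL

	/* Stands in for the kernel's struct page ->private link from the
	 * kernel PGD page to its separately allocated user PGD page. */
	struct pgd_pages {
		uint64_t *kernel;	/* order-0 page holding the kernel PGD */
		uint64_t *user;		/* independent order-0 page, user PGD  */
	};

	/* Two order-0 allocations replace one order-1 allocation; when
	 * the second fails the first is unwound, as in user_pgd_alloc(). */
	static int pgd_pages_alloc(struct pgd_pages *p)
	{
		p->kernel = calloc(1, PAGE_SIZE);
		if (!p->kernel)
			return -1;
		p->user = calloc(1, PAGE_SIZE);
		if (!p->user) {
			free(p->kernel);
			p->kernel = NULL;
			return -1;
		}
		return 0;
	}

	/* Offset-preserving translation as in the reworked __user_pgd():
	 * an entry's offset within the kernel page indexes the user page. */
	static uint64_t *user_slot(const struct pgd_pages *p,
				   const uint64_t *kslot)
	{
		return (uint64_t *)((uintptr_t)p->user
				    + ((uintptr_t)kslot & (PAGE_SIZE - 1)));
	}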

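The explicit "user" argument of xen_l4_entry_update() goes away as
well: whether an update must be mirrored into the user PGD is now
derived from the entry's offset alone, since only slots mapping
user-mode addresses (index at most pgd_index(TASK_SIZE_MAX)) have a
user counterpart. A hedged restatement of the new test, relying on the
kernel's own pgd_index(), TASK_SIZE_MAX and PAGE_MASK:

	/* Sketch: true iff the L4 slot at 'ptr' covers user-mode
	 * addresses and so must be updated in the user PGD as well. */
	static inline bool l4_slot_is_user(const pgd_t *ptr)
	{
		return ((unsigned long)ptr & ~PAGE_MASK)
		       <= pgd_index(TASK_SIZE_MAX) * sizeof(*ptr);
	}
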
--- head.orig/arch/x86/include/mach-xen/asm/hypervisor.h	2012-05-31 14:50:21.000000000 +0200
+++ head/arch/x86/include/mach-xen/asm/hypervisor.h	2012-05-31 14:50:27.000000000 +0200
@@ -101,8 +101,8 @@ void do_hypervisor_callback(struct pt_re
  * be MACHINE addresses.
  */
 
-void xen_pt_switch(unsigned long ptr);
-void xen_new_user_pt(unsigned long ptr); /* x86_64 only */
+void xen_pt_switch(pgd_t *);
+void xen_new_user_pt(pgd_t *); /* x86_64 only */
 void xen_load_gs(unsigned int selector); /* x86_64 only */
 void xen_tlb_flush(void);
 void xen_invlpg(unsigned long ptr);
@@ -110,7 +110,7 @@ void xen_invlpg(unsigned long ptr);
 void xen_l1_entry_update(pte_t *ptr, pte_t val);
 void xen_l2_entry_update(pmd_t *ptr, pmd_t val);
 void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */
-void xen_l4_entry_update(pgd_t *ptr, int user, pgd_t val); /* x86_64 only */
+void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */
 void xen_pgd_pin(pgd_t *);
 void xen_pgd_unpin(pgd_t *);
 
--- head.orig/arch/x86/include/mach-xen/asm/mmu_context.h	2011-09-08 16:54:08.000000000 +0200
+++ head/arch/x86/include/mach-xen/asm/mmu_context.h	2011-02-08 10:46:27.000000000 +0100
@@ -82,6 +82,9 @@ static inline void switch_mm(struct mm_s
 {
 	unsigned cpu = smp_processor_id();
 	struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op;
+#ifdef CONFIG_X86_64
+	pgd_t *upgd;
+#endif
 
 	if (likely(prev != next)) {
 		BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
@@ -98,10 +101,11 @@ static inline void switch_mm(struct mm_s
 		op->arg1.mfn = virt_to_mfn(next->pgd);
 		op++;
 
-		/* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
+		/* xen_new_user_pt(next->pgd) */
 #ifdef CONFIG_X86_64
 		op->cmd = MMUEXT_NEW_USER_BASEPTR;
-		op->arg1.mfn = virt_to_mfn(__user_pgd(next->pgd));
+		upgd = __user_pgd(next->pgd);
+		op->arg1.mfn = likely(upgd) ? virt_to_mfn(upgd) : 0;
 		op++;
 #endif
 
@@ -132,7 +136,7 @@ static inline void switch_mm(struct mm_s
 			 * to make sure to use no freed page tables.
 			 */
 			load_cr3(next->pgd);
-			xen_new_user_pt(__pa(__user_pgd(next->pgd)));
+			xen_new_user_pt(next->pgd);
 			load_LDT_nolock(&next->context);
 		}
 	}
--- head.orig/arch/x86/include/mach-xen/asm/pgalloc.h	2011-02-03 14:41:13.000000000 +0100
+++ head/arch/x86/include/mach-xen/asm/pgalloc.h	2011-02-03 14:42:36.000000000 +0100
@@ -123,15 +123,13 @@ static inline void pud_populate(struct m
 #endif	/* CONFIG_X86_PAE */
 
 #if PAGETABLE_LEVELS > 3
-#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
-
 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
 {
 	pgd_t ent = __pgd(_PAGE_TABLE | __pa(pud));
 
 	paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
 	if (unlikely(PagePinned(virt_to_page(pgd))))
-		xen_l4_entry_update(pgd, 1, ent);
+		xen_l4_entry_update(pgd, ent);
 	else
 		*__user_pgd(pgd) = *pgd = ent;
 }
--- head.orig/arch/x86/include/mach-xen/asm/pgtable_64.h	2011-03-23 10:12:10.000000000 +0100
+++ head/arch/x86/include/mach-xen/asm/pgtable_64.h	2011-03-23 10:13:50.000000000 +0100
@@ -106,18 +106,25 @@ static inline void xen_set_pud(pud_t *pu
 	: (void)(*__pudp = xen_make_pud(0));	\
 })
 
-#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
+static inline pgd_t *__user_pgd(pgd_t *pgd)
+{
+	if (unlikely(((unsigned long)pgd & PAGE_MASK)
+		     == (unsigned long)init_level4_pgt))
+		return NULL;
+	return (pgd_t *)(virt_to_page(pgd)->private
+			 + ((unsigned long)pgd & ~PAGE_MASK));
+}
 
 static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
-	xen_l4_entry_update(pgdp, 0, pgd);
+	xen_l4_entry_update(pgdp, pgd);
 }
 
 #define xen_pgd_clear(pgd)			\
 ({						\
 	pgd_t *__pgdp = (pgd);			\
 	PagePinned(virt_to_page(__pgdp))	\
-	? xen_l4_entry_update(__pgdp, 1, xen_make_pgd(0)) \
+	? xen_l4_entry_update(__pgdp, xen_make_pgd(0)) \
 	: (void)(*__user_pgd(__pgdp) = *__pgdp = xen_make_pgd(0)); \
 })
 
--- head.orig/arch/x86/kernel/cpu/common-xen.c	2012-04-11 17:13:36.000000000 +0200
+++ head/arch/x86/kernel/cpu/common-xen.c	2012-04-11 17:55:53.000000000 +0200
@@ -1087,8 +1087,7 @@ DEFINE_PER_CPU_FIRST(union irq_stack_uni
 void xen_switch_pt(void)
 {
 #ifdef CONFIG_XEN
-	xen_pt_switch(__pa_symbol(init_level4_pgt));
-	xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
+	xen_pt_switch(init_level4_pgt);
 #endif
 }
 
--- head.orig/arch/x86/kernel/head_64-xen.S	2011-08-09 11:07:15.000000000 +0200
+++ head/arch/x86/kernel/head_64-xen.S	2011-08-09 11:17:44.000000000 +0200
@@ -57,14 +57,6 @@ ENTRY(name)
 	__PAGE_ALIGNED_BSS
 NEXT_PAGE(init_level4_pgt)
 	.fill	512,8,0
-        /*
-         * We update two pgd entries to make kernel and user pgd consistent
-         * at pgd_populate(). It can be used for kernel modules. So we place 
-         * this page here for those cases to avoid memory corruption.
-         * We also use this page to establish the initial mapping for the
-         * vsyscall area.
-         */
-	.fill	512,8,0
 
 NEXT_PAGE(level3_kernel_pgt)
 	.fill	512,8,0
--- head.orig/arch/x86/mm/hypervisor.c	2012-05-31 14:50:18.000000000 +0200
+++ head/arch/x86/mm/hypervisor.c	2012-05-31 14:50:28.000000000 +0200
@@ -519,7 +519,7 @@ void xen_l3_entry_update(pud_t *ptr, pud
 #endif
 
 #ifdef CONFIG_X86_64
-void xen_l4_entry_update(pgd_t *ptr, int user, pgd_t val)
+void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
 {
 	mmu_update_t u[2];
 	struct page *page = NULL;
@@ -532,8 +532,11 @@ void xen_l4_entry_update(pgd_t *ptr, int
 	}
 	u[0].ptr = virt_to_machine(ptr);
 	u[0].val = __pgd_val(val);
-	if (user) {
-		u[1].ptr = virt_to_machine(__user_pgd(ptr));
+	if (((unsigned long)ptr & ~PAGE_MASK)
+	    <= pgd_index(TASK_SIZE_MAX) * sizeof(*ptr)) {
+		ptr = __user_pgd(ptr);
+		BUG_ON(!ptr);
+		u[1].ptr = virt_to_machine(ptr);
 		u[1].val = __pgd_val(val);
 		do_lN_entry_update(u, 2, page);
 	} else
@@ -541,21 +544,25 @@ void xen_l4_entry_update(pgd_t *ptr, int
 }
 #endif /* CONFIG_X86_64 */
 
-void xen_pt_switch(unsigned long ptr)
+#ifdef CONFIG_X86_64
+void xen_pt_switch(pgd_t *pgd)
 {
 	struct mmuext_op op;
 	op.cmd = MMUEXT_NEW_BASEPTR;
-	op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+	op.arg1.mfn = virt_to_mfn(pgd);
 	BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
 }
 
-void xen_new_user_pt(unsigned long ptr)
+void xen_new_user_pt(pgd_t *pgd)
 {
 	struct mmuext_op op;
+
+	pgd = __user_pgd(pgd);
 	op.cmd = MMUEXT_NEW_USER_BASEPTR;
-	op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+	op.arg1.mfn = pgd ? virt_to_mfn(pgd) : 0;
 	BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
 }
+#endif
 
 void xen_tlb_flush(void)
 {
@@ -632,7 +639,14 @@ void xen_pgd_pin(pgd_t *pgd)
 	op[0].arg1.mfn = virt_to_mfn(pgd);
 #ifdef CONFIG_X86_64
 	op[1].cmd = op[0].cmd = MMUEXT_PIN_L4_TABLE;
-	op[1].arg1.mfn = virt_to_mfn(__user_pgd(pgd));
+	pgd = __user_pgd(pgd);
+	if (pgd)
+		op[1].arg1.mfn = virt_to_mfn(pgd);
+	else {
+		op[1].cmd = MMUEXT_PIN_L3_TABLE;
+		op[1].arg1.mfn = pfn_to_mfn(__pa_symbol(level3_user_pgt)
+					    >> PAGE_SHIFT);
+	}
 #endif
 	if (HYPERVISOR_mmuext_op(op, NR_PGD_PIN_OPS, NULL, DOMID_SELF) < 0)
 		BUG();
@@ -645,8 +659,10 @@ void xen_pgd_unpin(pgd_t *pgd)
 	op[0].cmd = MMUEXT_UNPIN_TABLE;
 	op[0].arg1.mfn = virt_to_mfn(pgd);
 #ifdef CONFIG_X86_64
+	pgd = __user_pgd(pgd);
+	BUG_ON(!pgd);
 	op[1].cmd = MMUEXT_UNPIN_TABLE;
-	op[1].arg1.mfn = virt_to_mfn(__user_pgd(pgd));
+	op[1].arg1.mfn = virt_to_mfn(pgd);
 #endif
 	if (HYPERVISOR_mmuext_op(op, NR_PGD_PIN_OPS, NULL, DOMID_SELF) < 0)
 		BUG();
--- head.orig/arch/x86/mm/init_64-xen.c	2012-04-11 17:55:40.000000000 +0200
+++ head/arch/x86/mm/init_64-xen.c	2012-04-11 17:55:48.000000000 +0200
@@ -760,9 +760,6 @@ void __init xen_init_pt(void)
 	       (PTRS_PER_PUD - pud_index(__START_KERNEL_map))
 	       * sizeof(*level3_kernel_pgt));
 
-	__user_pgd(init_level4_pgt)[pgd_index(VSYSCALL_START)] =
-		__pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
-
 	/* Do an early initialization of the fixmap area. */
 	addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
 	if (pud_present(level3_kernel_pgt[pud_index(addr)])) {
@@ -778,8 +775,6 @@ void __init xen_init_pt(void)
 
 	early_make_page_readonly(init_level4_pgt,
 				 XENFEAT_writable_page_tables);
-	early_make_page_readonly(__user_pgd(init_level4_pgt),
-				 XENFEAT_writable_page_tables);
 	early_make_page_readonly(level3_kernel_pgt,
 				 XENFEAT_writable_page_tables);
 	early_make_page_readonly(level3_user_pgt,
--- head.orig/arch/x86/mm/pgtable-xen.c	2010-11-23 16:31:40.000000000 +0100
+++ head/arch/x86/mm/pgtable-xen.c	2011-04-11 16:14:31.000000000 +0200
@@ -291,9 +291,11 @@ static void pgd_walk(pgd_t *pgd_base, pg
 			BUG();
 		seq = 0;
 	}
+	pgd = __user_pgd(pgd_base);
+	BUG_ON(!pgd);
 	MULTI_update_va_mapping(mcl + seq,
-	       (unsigned long)__user_pgd(pgd_base),
-	       pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
+	       (unsigned long)pgd,
+	       pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, flags),
 	       0);
 	MULTI_update_va_mapping(mcl + seq + 1,
 	       (unsigned long)pgd_base,
@@ -688,19 +690,37 @@ static void pgd_prepopulate_pmd(struct m
 	}
 }
 
+static inline pgd_t *user_pgd_alloc(pgd_t *pgd)
+{
 #ifdef CONFIG_X86_64
-/* We allocate two contiguous pages for kernel and user. */
-#define PGD_ORDER 1
-#else
-#define PGD_ORDER 0
+	if (pgd) {
+		pgd_t *upgd = (void *)__get_free_page(PGALLOC_GFP);
+
+		if (upgd)
+			set_page_private(virt_to_page(pgd),
+					 (unsigned long)upgd);
+		else {
+			free_page((unsigned long)pgd);
+			pgd = NULL;
+		}
+	}
+#endif
+	return pgd;
+}
+
+static inline void user_pgd_free(pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+	free_page(page_private(virt_to_page(pgd)));
 #endif
+}
 
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *pgd;
 	pmd_t *pmds[PREALLOCATED_PMDS];
 
-	pgd = (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ORDER);
+	pgd = user_pgd_alloc((void *)__get_free_page(PGALLOC_GFP));
 
 	if (pgd == NULL)
 		goto out;
@@ -739,7 +759,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 out_free_pmds:
 	free_pmds(pmds, mm, !xen_feature(XENFEAT_pae_pgdir_above_4gb));
 out_free_pgd:
-	free_pages((unsigned long)pgd, PGD_ORDER);
+	user_pgd_free(pgd);
+	free_page((unsigned long)pgd);
 out:
 	return NULL;
 }
@@ -758,7 +779,8 @@ void pgd_free(struct mm_struct *mm, pgd_
 
 	pgd_mop_up_pmds(mm, pgd);
 	paravirt_pgd_free(mm, pgd);
-	free_pages((unsigned long)pgd, PGD_ORDER);
+	user_pgd_free(pgd);
+	free_page((unsigned long)pgd);
 }
 
 /* blktap and gntdev need this, as otherwise they would implicitly (and
--- head.orig/drivers/xen/core/machine_reboot.c	2011-11-18 17:17:11.000000000 +0100
+++ head/drivers/xen/core/machine_reboot.c	2011-11-18 17:18:17.000000000 +0100
@@ -174,8 +174,7 @@ static int take_machine_down(void *_susp
 		 * in fast-suspend mode as that implies a new enough Xen.
 		 */
 		if (!suspend->fast_suspend)
-			xen_new_user_pt(__pa(__user_pgd(
-				current->active_mm->pgd)));
+			xen_new_user_pt(current->active_mm->pgd);
 #endif
 	}