Blob Blame History Raw
From: Andi Kleen <ak@linux.intel.com>
Date: Thu, 3 May 2018 16:39:51 -0700
Subject: [PATCH 8/8] mm, l1tf: Disallow non privileged high MMIO PROT_NONE
 mappings
Patch-mainline: v4.19-rc1
Git-commit: 42e4089c7890725fcd329999252dc489b72f2921
References: bnc#1087081, CVE-2018-3620

For L1TF, PROT_NONE mappings are protected by inverting the PFN in the
page table entry. This sets the high bits in the CPU's address space,
thus making sure that an unmapped entry does not point to valid
cached memory.

Some server system BIOSes put the MMIO mappings high up in the physical
address space. If such a high mapping was mapped to an unprivileged
user they could attack low memory by setting such a mapping to
PROT_NONE. This could happen through a special device driver
which is not access protected. Normal /dev/mem is of course
access protected.

To avoid this we forbid PROT_NONE mappings or mprotect for high MMIO
mappings.

Valid page mappings are allowed because the system is then unsafe
anyways.

We don't expect users to commonly use PROT_NONE on MMIO. But
to minimize any impact here we only do this if the mapping actually
refers to a high MMIO address (defined as the MAX_PA-1 bit being set),
and also skip the check for root.

For mmaps this is straightforward and can be handled in vm_insert_pfn
and in remap_pfn_range().

For mprotect it's a bit trickier. At the point we're looking at the
actual PTEs a lot of state has been changed and would be difficult
to undo on an error. Since this is an uncommon case we use a separate
early page table walk pass for MMIO PROT_NONE mappings that
checks for this condition early. For non MMIO and non PROT_NONE
there are no changes.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-By: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>

---
v2: Use new helpers added earlier
v3: Fix inverted check added in v3
v4: Use l1tf_pfn_limit (Thomas)
Add comment for locked down kernels
v5: Use boot_cpu_has_bug. Check bug early in arch_has_pfn_modify_check
---
 arch/x86/include/asm/pgtable.h |    8 ++++++
 arch/x86/mm/mmap.c             |   21 +++++++++++++++++
 include/asm-generic/pgtable.h  |   12 ++++++++++
 mm/memory.c                    |   37 ++++++++++++++++++++++--------
 mm/mprotect.c                  |   49 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 117 insertions(+), 10 deletions(-)

--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1245,6 +1245,14 @@ static inline u16 pte_flags_pkey(unsigne
 #endif
 }
 
+#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1
+extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot);
+
+static inline bool arch_has_pfn_modify_check(void)
+{
+	return boot_cpu_has_bug(X86_BUG_L1TF);
+}
+
 #include <asm-generic/pgtable.h>
 #endif	/* __ASSEMBLY__ */
 
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -172,3 +172,24 @@ const char *arch_vma_name(struct vm_area
 		return "[mpx]";
 	return NULL;
 }
+
+/*
+ * Only allow root to set high MMIO mappings to PROT_NONE.
+ * This prevents an unprivileged user from setting them to PROT_NONE, which
+ * would invert them so they point to valid memory for L1TF speculation.
+ *
+ * Note: locked-down kernels may want to disable the root override.
+ */
+bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
+{
+	if (!boot_cpu_has_bug(X86_BUG_L1TF))
+		return true;
+	if (!__pte_needs_invert(pgprot_val(prot)))
+		return true;
+	/* If it's real memory always allow */
+	if (pfn_valid(pfn))
+		return true;
+	if (pfn > l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN))
+		return false;
+	return true;
+}
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -995,4 +995,16 @@ static inline void init_espfix_bsp(void)
 #endif
 #endif
 
+#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
+static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
+{
+	return true;
+}
+
+static inline bool arch_has_pfn_modify_check(void)
+{
+	return false;
+}
+#endif
+
 #endif /* _ASM_GENERIC_PGTABLE_H */
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1874,6 +1874,9 @@ int vm_insert_pfn_prot(struct vm_area_st
 	if (addr < vma->vm_start || addr >= vma->vm_end)
 		return -EFAULT;
 
+	if (!pfn_modify_allowed(pfn, pgprot))
+		return -EACCES;
+
 	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
 
 	ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
@@ -1895,6 +1898,9 @@ static int __vm_insert_mixed(struct vm_a
 
 	track_pfn_insert(vma, &pgprot, pfn);
 
+	if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
+		return -EACCES;
+
 	/*
 	 * If we don't have pte special, then we have to use the pfn_valid()
 	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
@@ -1942,6 +1948,7 @@ static int remap_pte_range(struct mm_str
 {
 	pte_t *pte;
 	spinlock_t *ptl;
+	int err = 0;
 
 	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
@@ -1949,12 +1956,16 @@ static int remap_pte_range(struct mm_str
 	arch_enter_lazy_mmu_mode();
 	do {
 		BUG_ON(!pte_none(*pte));
+		if (!pfn_modify_allowed(pfn, prot)) {
+			err = -EACCES;
+			break;
+		}
 		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
 		pfn++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
-	return 0;
+	return err;
 }
 
 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -1963,6 +1974,7 @@ static inline int remap_pmd_range(struct
 {
 	pmd_t *pmd;
 	unsigned long next;
+	int err;
 
 	pfn -= addr >> PAGE_SHIFT;
 	pmd = pmd_alloc(mm, pud, addr);
@@ -1971,9 +1983,10 @@ static inline int remap_pmd_range(struct
 	VM_BUG_ON(pmd_trans_huge(*pmd));
 	do {
 		next = pmd_addr_end(addr, end);
-		if (remap_pte_range(mm, pmd, addr, next,
-				pfn + (addr >> PAGE_SHIFT), prot))
-			return -ENOMEM;
+		err = remap_pte_range(mm, pmd, addr, next,
+				pfn + (addr >> PAGE_SHIFT), prot);
+		if (err)
+			return err;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
@@ -1984,6 +1997,7 @@ static inline int remap_pud_range(struct
 {
 	pud_t *pud;
 	unsigned long next;
+	int err;
 
 	pfn -= addr >> PAGE_SHIFT;
 	pud = pud_alloc(mm, p4d, addr);
@@ -1991,9 +2005,10 @@ static inline int remap_pud_range(struct
 		return -ENOMEM;
 	do {
 		next = pud_addr_end(addr, end);
-		if (remap_pmd_range(mm, pud, addr, next,
-				pfn + (addr >> PAGE_SHIFT), prot))
-			return -ENOMEM;
+		err = remap_pmd_range(mm, pud, addr, next,
+				pfn + (addr >> PAGE_SHIFT), prot);
+		if (err)
+			return err;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
@@ -2004,6 +2019,7 @@ static inline int remap_p4d_range(struct
 {
 	p4d_t *p4d;
 	unsigned long next;
+	int err;
 
 	pfn -= addr >> PAGE_SHIFT;
 	p4d = p4d_alloc(mm, pgd, addr);
@@ -2011,9 +2027,10 @@ static inline int remap_p4d_range(struct
 		return -ENOMEM;
 	do {
 		next = p4d_addr_end(addr, end);
-		if (remap_pud_range(mm, p4d, addr, next,
-				pfn + (addr >> PAGE_SHIFT), prot))
-			return -ENOMEM;
+		err = remap_pud_range(mm, p4d, addr, next,
+				pfn + (addr >> PAGE_SHIFT), prot);
+		if (err)
+			return err;
 	} while (p4d++, addr = next, addr != end);
 	return 0;
 }
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -307,6 +307,42 @@ unsigned long change_protection(struct v
 	return pages;
 }
 
+static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
+			       unsigned long next, struct mm_walk *walk)
+{
+	return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
+		0 : -EACCES;
+}
+
+static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
+				   unsigned long addr, unsigned long next,
+				   struct mm_walk *walk)
+{
+	return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
+		0 : -EACCES;
+}
+
+static int prot_none_test(unsigned long addr, unsigned long next,
+			  struct mm_walk *walk)
+{
+	return 0;
+}
+
+static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
+			   unsigned long end, unsigned long newflags)
+{
+	pgprot_t new_pgprot = vm_get_page_prot(newflags);
+	struct mm_walk prot_none_walk = {
+		.pte_entry = prot_none_pte_entry,
+		.hugetlb_entry = prot_none_hugetlb_entry,
+		.test_walk = prot_none_test,
+		.mm = current->mm,
+		.private = &new_pgprot,
+	};
+
+	return walk_page_range(start, end, &prot_none_walk);
+}
+
 int
 mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	unsigned long start, unsigned long end, unsigned long newflags)
@@ -325,6 +361,19 @@ mprotect_fixup(struct vm_area_struct *vm
 	}
 
 	/*
+	 * Do PROT_NONE PFN permission checks here when we can still
+	 * bail out without undoing a lot of state. This is a rather
+	 * uncommon case, so doesn't need to be very optimized.
+	 */
+	if (arch_has_pfn_modify_check() &&
+	    (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
+	    (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
+		error = prot_none_walk(vma, start, end, newflags);
+		if (error)
+			return error;
+	}
+
+	/*
 	 * If we make a private mapping writable we increase our commit;
 	 * but (without finer accounting) cannot reduce our commit if we
 	 * make it unwritable again. hugetlb mapping were accounted for