Blob Blame History Raw
From: Jane Chu <jane.chu@oracle.com>
Date: Mon, 16 May 2022 11:38:10 -0700
Subject: mce: fix set_mce_nospec to always unmap the whole page
Git-commit: 5898b43af954b83c4a4ee4ab85c4dbafa395822a
Patch-mainline: v5.19-rc1
References: git-fixes

The set_memory_uc() approach doesn't work well in all cases.
As Dan pointed out when "The VMM unmapped the bad page from
guest physical space and passed the machine check to the guest."
"The guest gets virtual #MC on an access to that page. When
the guest tries to do set_memory_uc() and instructs cpa_flush()
to do clean caches that results in taking another fault / exception
perhaps because the VMM unmapped the page from the guest."

Since the driver has special knowledge to handle NP or UC,
mark the poisoned page with NP and let driver handle it when
it comes down to repair.

Please refer to discussions here for more details.
https://lore.kernel.org/all/CAPcyv4hrXPb1tASBZUg-GgdVs0OOFKXMXLiHmktg_kFi7YBMyQ@mail.gmail.com/

Now since poisoned page is marked as not-present, in order to
avoid writing to a not-present page and trigger kernel Oops,
also fix pmem_do_write().

Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()")
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jane Chu <jane.chu@oracle.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/165272615484.103830.2563950688772226611.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Nikolay Borisov <nik.borisov@suse.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c |   11 ++---------
 arch/x86/mm/pageattr.c           |   22 ++++++++++------------
 drivers/nvdimm/pmem.c            |   26 +++++++-------------------
 include/linux/set_memory.h       |    4 ++--
 4 files changed, 21 insertions(+), 42 deletions(-)

--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -534,13 +534,6 @@ bool mce_is_memory_error(struct mce *m)
 }
 EXPORT_SYMBOL_GPL(mce_is_memory_error);
 
-static bool whole_page(struct mce *m)
-{
-	if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
-		return true;
-	return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
-}
-
 bool mce_is_correctable(struct mce *m)
 {
 	if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
@@ -609,7 +602,7 @@ static int srao_decode_notifier(struct n
 	if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
 		pfn = mce->addr >> PAGE_SHIFT;
 		if (!memory_failure(pfn, MCE_VECTOR, 0))
-			set_mce_nospec(pfn, whole_page(mce));
+			set_mce_nospec(pfn);
 	}
 
 	return NOTIFY_OK;
@@ -1115,7 +1108,7 @@ static int do_memory_failure(struct mce
 	if (ret)
 		pr_err("Memory error not recovered");
 	else
-		set_mce_nospec(m->addr >> PAGE_SHIFT, whole_page(m));
+		set_mce_nospec(m->addr >> PAGE_SHIFT);
 	return ret;
 }
 
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1743,14 +1743,9 @@ int set_memory_wb(unsigned long addr, in
 }
 EXPORT_SYMBOL(set_memory_wb);
 
-/*
- * Prevent speculative access to the page by either unmapping
- * it (if we do not require access to any part of the page) or
- * marking it uncacheable (if we want to try to retrieve data
- * from non-poisoned lines in the page).
- */
 #ifdef CONFIG_X86_64
-int set_mce_nospec(unsigned long pfn, bool unmap)
+/* Prevent speculative access to a page by marking it not-present */
+int set_mce_nospec(unsigned long pfn)
 {
 	unsigned long decoy_addr;
 	int rc;
@@ -1769,19 +1764,22 @@ int set_mce_nospec(unsigned long pfn, bo
 	 */
 	decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
 
-	if (unmap)
-		rc = set_memory_np(decoy_addr, 1);
-	else
-		rc = set_memory_uc(decoy_addr, 1);
+	rc = set_memory_np(decoy_addr, 1);
 	if (rc)
 		pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
 	return rc;
 }
 
+static int set_memory_present(unsigned long *addr, int numpages)
+{
+       return change_page_attr_set(addr, numpages, __pgprot(_PAGE_PRESENT), 0);
+}
 /* Restore full speculative operation to the pfn. */
 int clear_mce_nospec(unsigned long pfn)
 {
-	return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1);
+	unsigned long addr = (unsigned long) pfn_to_kaddr(pfn);
+
+	return set_memory_present(&addr, 1);
 }
 EXPORT_SYMBOL_GPL(clear_mce_nospec);
 #endif
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -152,8 +152,14 @@ static blk_status_t pmem_do_bvec(struct
 	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
 	void *pmem_addr = pmem->virt_addr + pmem_off;
 
-	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
+	if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) {
 		bad_pmem = true;
+		if (is_write) {
+			rc = pmem_clear_poison(pmem, pmem_off, len);
+			if (rc != BLK_STS_OK)
+				return rc;
+		}
+	}
 
 	if (!is_write) {
 		if (unlikely(bad_pmem))
@@ -163,26 +169,8 @@ static blk_status_t pmem_do_bvec(struct
 			flush_dcache_page(page);
 		}
 	} else {
-		/*
-		 * Note that we write the data both before and after
-		 * clearing poison.  The write before clear poison
-		 * handles situations where the latest written data is
-		 * preserved and the clear poison operation simply marks
-		 * the address range as valid without changing the data.
-		 * In this case application software can assume that an
-		 * interrupted write will either return the new good
-		 * data or an error.
-		 *
-		 * However, if pmem_clear_poison() leaves the data in an
-		 * indeterminate state we need to perform the write
-		 * after clear poison.
-		 */
 		flush_dcache_page(page);
 		write_pmem(pmem_addr, page, off, len);
-		if (unlikely(bad_pmem)) {
-			rc = pmem_clear_poison(pmem, pmem_off, len);
-			write_pmem(pmem_addr, page, off, len);
-		}
 	}
 
 	return rc;
--- a/include/linux/set_memory.h
+++ b/include/linux/set_memory.h
@@ -18,10 +18,10 @@ static inline int set_memory_nx(unsigned
 #endif
 
 #ifdef CONFIG_X86_64
-int set_mce_nospec(unsigned long pfn, bool unmap);
+int set_mce_nospec(unsigned long pfn);
 int clear_mce_nospec(unsigned long pfn);
 #else
-static inline int set_mce_nospec(unsigned long pfn, bool unmap)
+static inline int set_mce_nospec(unsigned long pfn)
 {
 	return 0;
 }