Borislav Petkov 0f0e4c
From: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Borislav Petkov 0f0e4c
Date: Thu, 19 May 2022 20:35:15 -0700
Borislav Petkov 0f0e4c
Subject: KVM: x86/speculation: Disable Fill buffer clear within guests
Borislav Petkov 0f0e4c
Git-commit: 027bbb884be006b05d9c577d6401686053aa789e
Borislav Petkov 0f0e4c
Git-repo: git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
Borislav Petkov 0f0e4c
Patch-mainline: Queued in tip for v5.19
Borislav Petkov 0f0e4c
References: bsc#1199650 CVE-2022-21166 CVE-2022-21127 CVE-2022-21123 CVE-2022-21125 CVE-2022-21180
Borislav Petkov 0f0e4c
Borislav Petkov 0f0e4c
The enumeration of MD_CLEAR in CPUID(EAX=7,ECX=0).EDX{bit 10} is not an
Borislav Petkov 0f0e4c
accurate indicator on all CPUs of whether the VERW instruction will
Borislav Petkov 0f0e4c
overwrite fill buffers. FB_CLEAR enumeration in
Borislav Petkov 0f0e4c
IA32_ARCH_CAPABILITIES{bit 17} covers the case of CPUs that are not
Borislav Petkov 0f0e4c
vulnerable to MDS/TAA, indicating that microcode does overwrite fill
Borislav Petkov 0f0e4c
buffers.
Borislav Petkov 0f0e4c
Borislav Petkov 0f0e4c
Guests running in VMM environments may not be aware of all the
Borislav Petkov 0f0e4c
capabilities/vulnerabilities of the host CPU. Specifically, a guest may
Borislav Petkov 0f0e4c
apply MDS/TAA mitigations when a virtual CPU is enumerated as vulnerable
Borislav Petkov 0f0e4c
to MDS/TAA even when the physical CPU is not. On CPUs that enumerate
Borislav Petkov 0f0e4c
FB_CLEAR_CTRL the VMM may set FB_CLEAR_DIS to skip overwriting of fill
Borislav Petkov 0f0e4c
buffers by the VERW instruction. This is done by setting FB_CLEAR_DIS
Borislav Petkov 0f0e4c
during VMENTER and resetting on VMEXIT. For guests that enumerate
Borislav Petkov 0f0e4c
FB_CLEAR (explicitly asking for fill buffer clear capability) the VMM
Borislav Petkov 0f0e4c
will not use FB_CLEAR_DIS.
Borislav Petkov 0f0e4c
Borislav Petkov 0f0e4c
Irrespective of guest state, the host overwrites CPU buffers before
Borislav Petkov 0f0e4c
VMENTER to protect itself from an MMIO-capable guest, as part of the
Borislav Petkov 0f0e4c
mitigation for MMIO Stale Data vulnerabilities.
Borislav Petkov 0f0e4c
Borislav Petkov 0f0e4c
Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Borislav Petkov 0f0e4c
Signed-off-by: Borislav Petkov <bp@suse.de>
Borislav Petkov 0f0e4c
---
Borislav Petkov 0f0e4c
 arch/x86/include/asm/msr-index.h |    6 +++
Borislav Petkov 0f0e4c
 arch/x86/kvm/vmx.c               |   73 +++++++++++++++++++++++++++++++++++++++
Borislav Petkov 0f0e4c
 arch/x86/kvm/x86.c               |    3 +
Borislav Petkov 0f0e4c
 3 files changed, 82 insertions(+)
Borislav Petkov 0f0e4c
Borislav Petkov 0f0e4c
--- a/arch/x86/include/asm/msr-index.h
Borislav Petkov 0f0e4c
+++ b/arch/x86/include/asm/msr-index.h
Borislav Petkov 0f0e4c
@@ -123,6 +123,11 @@
Borislav Petkov 0f0e4c
 						 * VERW clears CPU fill buffer
Borislav Petkov 0f0e4c
 						 * even on MDS_NO CPUs.
Borislav Petkov 0f0e4c
 						 */
Borislav Petkov 0f0e4c
+#define ARCH_CAP_FB_CLEAR_CTRL		BIT(18)	/*
Borislav Petkov 0f0e4c
+						 * MSR_IA32_MCU_OPT_CTRL[FB_CLEAR_DIS]
Borislav Petkov 0f0e4c
+						 * bit available to control VERW
Borislav Petkov 0f0e4c
+						 * behavior.
Borislav Petkov 0f0e4c
+						 */
Borislav Petkov 0f0e4c
 
Borislav Petkov 0f0e4c
 #define MSR_IA32_FLUSH_CMD		0x0000010b
Borislav Petkov 0f0e4c
 #define L1D_FLUSH			BIT(0)	/*
Borislav Petkov 0f0e4c
@@ -142,6 +147,7 @@
Borislav Petkov 0f0e4c
 /* SRBDS support */
Borislav Petkov 0f0e4c
 #define MSR_IA32_MCU_OPT_CTRL		0x00000123
Borislav Petkov 0f0e4c
 #define RNGDS_MITG_DIS			BIT(0)
Borislav Petkov 0f0e4c
+#define FB_CLEAR_DIS			BIT(3)	/* CPU Fill buffer clear disable */
Borislav Petkov 0f0e4c
 
Borislav Petkov 0f0e4c
 #define MSR_IA32_SYSENTER_CS		0x00000174
Borislav Petkov 0f0e4c
 #define MSR_IA32_SYSENTER_ESP		0x00000175
Borislav Petkov 0f0e4c
--- a/arch/x86/kvm/vmx.c
Borislav Petkov 0f0e4c
+++ b/arch/x86/kvm/vmx.c
Borislav Petkov 0f0e4c
@@ -209,6 +209,9 @@ static const struct {
Borislav Petkov 0f0e4c
 #define L1D_CACHE_ORDER 4
Borislav Petkov 0f0e4c
 static void *vmx_l1d_flush_pages;
Borislav Petkov 0f0e4c
 
Borislav Petkov 0f0e4c
+/* Control for disabling CPU Fill buffer clear */
Borislav Petkov 0f0e4c
+static bool __read_mostly vmx_fb_clear_ctrl_available;
Borislav Petkov 0f0e4c
+
Borislav Petkov 0f0e4c
 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
Borislav Petkov 0f0e4c
 {
Borislav Petkov 0f0e4c
 	struct page *page;
Borislav Petkov 0f0e4c
@@ -824,6 +827,9 @@ struct vcpu_vmx {
Borislav Petkov 0f0e4c
 	 */
Borislav Petkov 0f0e4c
 	u64 msr_ia32_feature_control;
Borislav Petkov 0f0e4c
 	u64 msr_ia32_feature_control_valid_bits;
Borislav Petkov 0f0e4c
+
Borislav Petkov 0f0e4c
+	u64 msr_ia32_mcu_opt_ctrl;
Borislav Petkov 0f0e4c
+	bool disable_fb_clear;
Borislav Petkov 0f0e4c
 };
Borislav Petkov 0f0e4c
 
Borislav Petkov 0f0e4c
 enum segment_cache_field {
Borislav Petkov 0f0e4c
@@ -1693,6 +1699,61 @@ static void vmcs_load(struct vmcs *vmcs)
Borislav Petkov 0f0e4c
 		       vmcs, phys_addr);
Borislav Petkov 0f0e4c
 }
Borislav Petkov 0f0e4c
 
Borislav Petkov 0f0e4c
+static void vmx_setup_fb_clear_ctrl(void)
Borislav Petkov 0f0e4c
+{
Borislav Petkov 0f0e4c
+	u64 msr;
Borislav Petkov 0f0e4c
+
Borislav Petkov 0f0e4c
+	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
Borislav Petkov 0f0e4c
+	    !boot_cpu_has_bug(X86_BUG_MDS) &&
Borislav Petkov 0f0e4c
+	    !boot_cpu_has_bug(X86_BUG_TAA)) {
Borislav Petkov 0f0e4c
+		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
Borislav Petkov 0f0e4c
+		if (msr & ARCH_CAP_FB_CLEAR_CTRL)
Borislav Petkov 0f0e4c
+			vmx_fb_clear_ctrl_available = true;
Borislav Petkov 0f0e4c
+	}
Borislav Petkov 0f0e4c
+}
Borislav Petkov 0f0e4c
+
Borislav Petkov 0f0e4c
+static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
Borislav Petkov 0f0e4c
+{
Borislav Petkov 0f0e4c
+	u64 msr;
Borislav Petkov 0f0e4c
+
Borislav Petkov 0f0e4c
+	if (!vmx->disable_fb_clear)
Borislav Petkov 0f0e4c
+		return;
Borislav Petkov 0f0e4c
+
Borislav Petkov 0f0e4c
+	rdmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
Borislav Petkov 0f0e4c
+	msr |= FB_CLEAR_DIS;
Borislav Petkov 0f0e4c
+	wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
Borislav Petkov 0f0e4c
+	/* Cache the value so vmx_enable_fb_clear() need not re-read the MSR */
Borislav Petkov 0f0e4c
+	vmx->msr_ia32_mcu_opt_ctrl = msr;
Borislav Petkov 0f0e4c
+}
Borislav Petkov 0f0e4c
+
Borislav Petkov 0f0e4c
+static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
Borislav Petkov 0f0e4c
+{
Borislav Petkov 0f0e4c
+	if (!vmx->disable_fb_clear)
Borislav Petkov 0f0e4c
+		return;
Borislav Petkov 0f0e4c
+
Borislav Petkov 0f0e4c
+	vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
Borislav Petkov 0f0e4c
+	wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
Borislav Petkov 0f0e4c
+}
Borislav Petkov 0f0e4c
+
Borislav Petkov 0f0e4c
+static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
Borislav Petkov 0f0e4c
+{
Borislav Petkov 0f0e4c
+	vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
Borislav Petkov 0f0e4c
+
Borislav Petkov 0f0e4c
+	/*
Borislav Petkov 0f0e4c
+	 * Do not set FB_CLEAR_DIS at VMEntry (skipping the MSR read/write) if
Borislav Petkov 0f0e4c
+	 * the guest enumerates FB_CLEAR (explicitly asking for it) or has no
Borislav Petkov 0f0e4c
+	 * use case to execute VERW (all *_NO bits below are enumerated).
Borislav Petkov 0f0e4c
+	 */
Borislav Petkov 0f0e4c
+	if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
Borislav Petkov 0f0e4c
+	   ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
Borislav Petkov 0f0e4c
+	    (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
Borislav Petkov 0f0e4c
+	    (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
Borislav Petkov 0f0e4c
+	    (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
Borislav Petkov 0f0e4c
+	    (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
Borislav Petkov 0f0e4c
+		vmx->disable_fb_clear = false;
Borislav Petkov 0f0e4c
+}
Borislav Petkov 0f0e4c
+
Borislav Petkov 0f0e4c
+
Borislav Petkov 0f0e4c
 #ifdef CONFIG_KEXEC_CORE
Borislav Petkov 0f0e4c
 /*
Borislav Petkov 0f0e4c
  * This bitmap is used to indicate whether the vmclear
Borislav Petkov 0f0e4c
@@ -3775,6 +3836,10 @@ static int vmx_set_msr(struct kvm_vcpu *
Borislav Petkov 0f0e4c
 			ret = kvm_set_msr_common(vcpu, msr_info);
Borislav Petkov 0f0e4c
 	}
Borislav Petkov 0f0e4c
 
Borislav Petkov 0f0e4c
+	/* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
Borislav Petkov 0f0e4c
+	if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
Borislav Petkov 0f0e4c
+		vmx_update_fb_clear_dis(vcpu, vmx);
Borislav Petkov 0f0e4c
+
Borislav Petkov 0f0e4c
 	return ret;
Borislav Petkov 0f0e4c
 }
Borislav Petkov 0f0e4c
 
Borislav Petkov 0f0e4c
@@ -6109,6 +6174,8 @@ static void vmx_vcpu_reset(struct kvm_vc
Borislav Petkov 0f0e4c
 	update_exception_bitmap(vcpu);
Borislav Petkov 0f0e4c
 
Borislav Petkov 0f0e4c
 	vpid_sync_context(vmx->vpid);
Borislav Petkov 0f0e4c
+
Borislav Petkov 0f0e4c
+	vmx_update_fb_clear_dis(vcpu, vmx);
Borislav Petkov 0f0e4c
 }
Borislav Petkov 0f0e4c
 
Borislav Petkov 0f0e4c
 /*
Borislav Petkov 0f0e4c
@@ -9862,6 +9929,8 @@ static void __noclone vmx_vcpu_run(struc
Borislav Petkov 0f0e4c
 		 kvm_arch_has_assigned_device(vcpu->kvm))
Borislav Petkov 0f0e4c
 		mds_clear_cpu_buffers();
Borislav Petkov 0f0e4c
 
Borislav Petkov 0f0e4c
+	vmx_disable_fb_clear(vmx);
Borislav Petkov 0f0e4c
+
Borislav Petkov 0f0e4c
 	asm(
Borislav Petkov 0f0e4c
 		/* Store host registers */
Borislav Petkov 0f0e4c
 		"push %%" _ASM_DX "; push %%" _ASM_BP ";"
Borislav Petkov 0f0e4c
@@ -9991,6 +10060,8 @@ static void __noclone vmx_vcpu_run(struc
Borislav Petkov 0f0e4c
 #endif
Borislav Petkov 0f0e4c
 	      );
Borislav Petkov 0f0e4c
 
Borislav Petkov 0f0e4c
+	vmx_enable_fb_clear(vmx);
Borislav Petkov 0f0e4c
+
Borislav Petkov 0f0e4c
 	/*
Borislav Petkov 0f0e4c
 	 * We do not use IBRS in the kernel. If this vCPU has used the
Borislav Petkov 0f0e4c
 	 * SPEC_CTRL MSR it may have left it on; save the value and
Borislav Petkov 0f0e4c
@@ -13140,6 +13211,8 @@ static int __init vmx_init(void)
Borislav Petkov 0f0e4c
 		}
Borislav Petkov 0f0e4c
 	}
Borislav Petkov 0f0e4c
 
Borislav Petkov 0f0e4c
+	vmx_setup_fb_clear_ctrl();
Borislav Petkov 0f0e4c
+
Borislav Petkov 0f0e4c
 #ifdef CONFIG_KEXEC_CORE
Borislav Petkov 0f0e4c
 	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
Borislav Petkov 0f0e4c
 			   crash_vmclear_local_loaded_vmcss);
Borislav Petkov 0f0e4c
--- a/arch/x86/kvm/x86.c
Borislav Petkov 0f0e4c
+++ b/arch/x86/kvm/x86.c
Borislav Petkov 0f0e4c
@@ -1125,6 +1125,9 @@ u64 kvm_get_arch_capabilities(void)
Borislav Petkov 0f0e4c
 	 */
Borislav Petkov 0f0e4c
 	data |= ARCH_CAP_PSCHANGE_MC_NO;
Borislav Petkov 0f0e4c
 
Borislav Petkov 0f0e4c
+	/* Guests don't need to know "Fill buffer clear control" exists */
Borislav Petkov 0f0e4c
+	data &= ~ARCH_CAP_FB_CLEAR_CTRL;
Borislav Petkov 0f0e4c
+
Borislav Petkov 0f0e4c
 	return data;
Borislav Petkov 0f0e4c
 }
Borislav Petkov 0f0e4c