Borislav Petkov 24132f
From: Peter Zijlstra <peterz@infradead.org>
Borislav Petkov 24132f
Date: Tue, 14 Jun 2022 23:15:58 +0200
Borislav Petkov 24132f
Subject: intel_idle: Disable IBRS during long idle
Borislav Petkov 24132f
Git-commit: bf5835bcdb9635c97f85120dba9bfa21e111130f
Borislav Petkov d06c64
Patch-mainline: v5.19-rc4
Borislav Petkov 24132f
References: bsc#1199657 CVE-2022-29900 CVE-2022-29901
Borislav Petkov 24132f
Borislav Petkov 24132f
Having IBRS enabled while the SMT sibling is idle unnecessarily slows
Borislav Petkov 24132f
down the running sibling. OTOH, disabling IBRS around idle takes two
Borislav Petkov 24132f
MSR writes, which will increase the idle latency.
Borislav Petkov 24132f
Borislav Petkov 24132f
Therefore, only disable IBRS around deeper idle states. Shallow idle
Borislav Petkov 24132f
states are bounded by the tick in duration, since NOHZ is not allowed
Borislav Petkov 24132f
for them by virtue of their short target residency.
Borislav Petkov 24132f
Borislav Petkov 24132f
Only do this for mwait-driven idle, since that keeps interrupts disabled
Borislav Petkov 24132f
across idle, which makes disabling IBRS vs IRQ-entry a non-issue.
Borislav Petkov 24132f
Borislav Petkov 24132f
Note: C6 is an arbitrary threshold; most importantly, C1 probably shouldn't
Borislav Petkov 24132f
disable IBRS. Benchmarking is needed.
Borislav Petkov 24132f
Borislav Petkov 24132f
Suggested-by: Tim Chen <tim.c.chen@linux.intel.com>
Borislav Petkov 24132f
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Borislav Petkov 24132f
Signed-off-by: Borislav Petkov <bp@suse.de>
Borislav Petkov 24132f
Reviewed-by: Josh Poimboeuf <jpoimboe@kernel.org>
Borislav Petkov 24132f
Signed-off-by: Borislav Petkov <bp@suse.de>
Borislav Petkov 24132f
---
Borislav Petkov 24132f
 arch/x86/include/asm/nospec-branch.h |    1 
Borislav Petkov 24132f
 arch/x86/kernel/cpu/bugs.c           |    6 +++
Borislav Petkov 24132f
 drivers/idle/intel_idle.c            |   64 +++++++++++++++++++++++++++++++----
Borislav Petkov 24132f
 3 files changed, 65 insertions(+), 6 deletions(-)
Borislav Petkov 24132f
Borislav Petkov 24132f
--- a/arch/x86/include/asm/nospec-branch.h
Borislav Petkov 24132f
+++ b/arch/x86/include/asm/nospec-branch.h
Borislav Petkov 24132f
@@ -345,6 +345,7 @@ static inline void unrestrict_branch_spe
Borislav Petkov 24132f
 /* The Intel SPEC CTRL MSR base value cache */
Borislav Petkov 24132f
 extern u64 x86_spec_ctrl_base;
Borislav Petkov 24132f
 extern void write_spec_ctrl_current(u64 val, bool force);
Borislav Petkov 24132f
+extern u64 spec_ctrl_current(void);
Borislav Petkov 24132f
 
Borislav Petkov 24132f
 /*
Borislav Petkov 24132f
  * With retpoline, we must use IBRS to restrict branch prediction
Borislav Petkov 24132f
--- a/arch/x86/kernel/cpu/bugs.c
Borislav Petkov 24132f
+++ b/arch/x86/kernel/cpu/bugs.c
Borislav Petkov 24132f
@@ -76,6 +76,12 @@ void write_spec_ctrl_current(u64 val, bo
Borislav Petkov 24132f
 		wrmsrl(MSR_IA32_SPEC_CTRL, val);
Borislav Petkov 24132f
 }
Borislav Petkov 24132f
 
Borislav Petkov 24132f
+u64 spec_ctrl_current(void)
Borislav Petkov 24132f
+{
Borislav Petkov 24132f
+	return this_cpu_read(x86_spec_ctrl_current);
Borislav Petkov 24132f
+}
Borislav Petkov 24132f
+EXPORT_SYMBOL_GPL(spec_ctrl_current);
Borislav Petkov 24132f
+
Borislav Petkov 24132f
 /*
Borislav Petkov 24132f
  * The vendor and possibly platform specific bits which can be modified in
Borislav Petkov 24132f
  * x86_spec_ctrl_base.
Borislav Petkov 24132f
--- a/drivers/idle/intel_idle.c
Borislav Petkov 24132f
+++ b/drivers/idle/intel_idle.c
Borislav Petkov 24132f
@@ -56,11 +56,13 @@
Borislav Petkov 24132f
 #include <linux/tick.h>
Borislav Petkov 24132f
 #include <trace/events/power.h>
Borislav Petkov 24132f
 #include <linux/sched.h>
Borislav Petkov 24132f
+#include <linux/sched/smt.h>
Borislav Petkov 24132f
 #include <linux/notifier.h>
Borislav Petkov 24132f
 #include <linux/cpu.h>
Borislav Petkov 24132f
 #include <linux/moduleparam.h>
Borislav Petkov 24132f
 #include <asm/cpu_device_id.h>
Borislav Petkov 24132f
 #include <asm/intel-family.h>
Borislav Petkov 24132f
+#include <asm/nospec-branch.h>
Borislav Petkov 24132f
 #include <asm/mwait.h>
Borislav Petkov 24132f
 #include <asm/msr.h>
Borislav Petkov 24132f
 
Borislav Petkov 24132f
@@ -101,6 +103,12 @@ static void intel_idle_freeze(struct cpu
Borislav Petkov 24132f
 static struct cpuidle_state *cpuidle_state_table;
Borislav Petkov 24132f
 
Borislav Petkov 24132f
 /*
Borislav Petkov 24132f
+ * Disable IBRS across idle (when KERNEL_IBRS is enabled). This flag is
Borislav Petkov 24132f
+ * mutually exclusive with IRQ_ENABLE above.
Borislav Petkov 24132f
+ */
Borislav Petkov 24132f
+#define CPUIDLE_FLAG_IBRS		BIT(16)
Borislav Petkov 24132f
+
Borislav Petkov 24132f
+/*
Borislav Petkov 24132f
  * Set this flag for states where the HW flushes the TLB for us
Borislav Petkov 24132f
  * and so we don't need cross-calls to keep it consistent.
Borislav Petkov 24132f
  * If this flag is set, SW flushes the TLB, so even if the
Borislav Petkov 24132f
@@ -616,7 +624,7 @@ static struct cpuidle_state skl_cstates[
Borislav Petkov 24132f
 	{
Borislav Petkov 24132f
 		.name = "C6",
Borislav Petkov 24132f
 		.desc = "MWAIT 0x20",
Borislav Petkov 24132f
-		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
Borislav Petkov 24132f
+		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
Borislav Petkov 24132f
 		.exit_latency = 85,
Borislav Petkov 24132f
 		.target_residency = 200,
Borislav Petkov 24132f
 		.enter = &intel_idle,
Borislav Petkov 24132f
@@ -624,7 +632,7 @@ static struct cpuidle_state skl_cstates[
Borislav Petkov 24132f
 	{
Borislav Petkov 24132f
 		.name = "C7s",
Borislav Petkov 24132f
 		.desc = "MWAIT 0x33",
Borislav Petkov 24132f
-		.flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED,
Borislav Petkov 24132f
+		.flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
Borislav Petkov 24132f
 		.exit_latency = 124,
Borislav Petkov 24132f
 		.target_residency = 800,
Borislav Petkov 24132f
 		.enter = &intel_idle,
Borislav Petkov 24132f
@@ -632,7 +640,7 @@ static struct cpuidle_state skl_cstates[
Borislav Petkov 24132f
 	{
Borislav Petkov 24132f
 		.name = "C8",
Borislav Petkov 24132f
 		.desc = "MWAIT 0x40",
Borislav Petkov 24132f
-		.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
Borislav Petkov 24132f
+		.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
Borislav Petkov 24132f
 		.exit_latency = 200,
Borislav Petkov 24132f
 		.target_residency = 800,
Borislav Petkov 24132f
 		.enter = &intel_idle,
Borislav Petkov 24132f
@@ -640,7 +648,7 @@ static struct cpuidle_state skl_cstates[
Borislav Petkov 24132f
 	{
Borislav Petkov 24132f
 		.name = "C9",
Borislav Petkov 24132f
 		.desc = "MWAIT 0x50",
Borislav Petkov 24132f
-		.flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
Borislav Petkov 24132f
+		.flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
Borislav Petkov 24132f
 		.exit_latency = 480,
Borislav Petkov 24132f
 		.target_residency = 5000,
Borislav Petkov 24132f
 		.enter = &intel_idle,
Borislav Petkov 24132f
@@ -648,7 +656,7 @@ static struct cpuidle_state skl_cstates[
Borislav Petkov 24132f
 	{
Borislav Petkov 24132f
 		.name = "C10",
Borislav Petkov 24132f
 		.desc = "MWAIT 0x60",
Borislav Petkov 24132f
-		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
Borislav Petkov 24132f
+		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
Borislav Petkov 24132f
 		.exit_latency = 890,
Borislav Petkov 24132f
 		.target_residency = 5000,
Borislav Petkov 24132f
 		.enter = &intel_idle,
Borislav Petkov 24132f
@@ -677,7 +685,7 @@ static struct cpuidle_state skx_cstates[
Borislav Petkov 24132f
 	{
Borislav Petkov 24132f
 		.name = "C6",
Borislav Petkov 24132f
 		.desc = "MWAIT 0x20",
Borislav Petkov 24132f
-		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
Borislav Petkov 24132f
+		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
Borislav Petkov 24132f
 		.exit_latency = 133,
Borislav Petkov 24132f
 		.target_residency = 600,
Borislav Petkov 24132f
 		.enter = &intel_idle,
Borislav Petkov 24132f
@@ -934,6 +942,46 @@ static __cpuidle int intel_idle(struct c
Borislav Petkov 24132f
 	return index;
Borislav Petkov 24132f
 }
Borislav Petkov 24132f
 
Borislav Petkov 24132f
+/*
Borislav Petkov 24132f
+ * MWAIT takes an 8-bit "hint" in EAX "suggesting"
Borislav Petkov 24132f
+ * the C-state (top nibble) and sub-state (bottom nibble)
Borislav Petkov 24132f
+ * 0x00 means "MWAIT(C1)", 0x10 means "MWAIT(C2)" etc.
Borislav Petkov 24132f
+ *
Borislav Petkov 24132f
+ * We store the hint at the top of our "flags" for each state.
Borislav Petkov 24132f
+ */
Borislav Petkov 24132f
+#define flg2MWAIT(flags) (((flags) >> 24) & 0xFF)
Borislav Petkov 24132f
+#define MWAIT2flg(eax) ((eax & 0xFF) << 24)
Borislav Petkov 24132f
+
Borislav Petkov 24132f
+static __always_inline int __intel_idle(struct cpuidle_device *dev,
Borislav Petkov 24132f
+					struct cpuidle_driver *drv, int index)
Borislav Petkov 24132f
+{
Borislav Petkov 24132f
+	struct cpuidle_state *state = &drv->states[index];
Borislav Petkov 24132f
+	unsigned long eax = flg2MWAIT(state->flags);
Borislav Petkov 24132f
+	unsigned long ecx = 1; /* break on interrupt flag */
Borislav Petkov 24132f
+
Borislav Petkov 24132f
+	mwait_idle_with_hints(eax, ecx);
Borislav Petkov 24132f
+
Borislav Petkov 24132f
+	return index;
Borislav Petkov 24132f
+}
Borislav Petkov 24132f
+
Borislav Petkov 24132f
+static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev,
Borislav Petkov 24132f
+				     struct cpuidle_driver *drv, int index)
Borislav Petkov 24132f
+{
Borislav Petkov 24132f
+	bool smt_active = sched_smt_active();
Borislav Petkov 24132f
+	u64 spec_ctrl = spec_ctrl_current();
Borislav Petkov 24132f
+	int ret;
Borislav Petkov 24132f
+
Borislav Petkov 24132f
+	if (smt_active)
Borislav Petkov 24132f
+		wrmsrl(MSR_IA32_SPEC_CTRL, 0);
Borislav Petkov 24132f
+
Borislav Petkov 24132f
+	ret = __intel_idle(dev, drv, index);
Borislav Petkov 24132f
+
Borislav Petkov 24132f
+	if (smt_active)
Borislav Petkov 24132f
+		wrmsrl(MSR_IA32_SPEC_CTRL, spec_ctrl);
Borislav Petkov 24132f
+
Borislav Petkov 24132f
+	return ret;
Borislav Petkov 24132f
+}
Borislav Petkov 24132f
+
Borislav Petkov 24132f
 /**
Borislav Petkov 24132f
  * intel_idle_freeze - simplified "enter" callback routine for suspend-to-idle
Borislav Petkov 24132f
  * @dev: cpuidle_device
Borislav Petkov 24132f
@@ -1366,6 +1414,10 @@ static void __init intel_idle_cpuidle_dr
Borislav Petkov 24132f
 			continue;
Borislav Petkov 24132f
 		}
Borislav Petkov 24132f
 
Borislav Petkov 727875
+		if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
Borislav Petkov 24132f
+		    cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IBRS) {
Borislav Petkov 24132f
+			drv->states[drv->state_count].enter = intel_idle_ibrs;
Borislav Petkov 24132f
+		}
Borislav Petkov 24132f
 
Borislav Petkov 24132f
 		if (((mwait_cstate + 1) > 2) &&
Borislav Petkov 24132f
 			!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))