diff --git a/patches.suse/fix-race-between-exit_itimers-and-proc-pid-timers.patch b/patches.suse/fix-race-between-exit_itimers-and-proc-pid-timers.patch
new file mode 100644
index 0000000..6089df0
--- /dev/null
+++ b/patches.suse/fix-race-between-exit_itimers-and-proc-pid-timers.patch
@@ -0,0 +1,90 @@
+From d5b36a4dbd06c5e8e36ca8ccc552f679069e2946 Mon Sep 17 00:00:00 2001
+From: Oleg Nesterov
+Date: Mon, 11 Jul 2022 18:16:25 +0200
+Subject: [PATCH] fix race between exit_itimers() and /proc/pid/timers
+Git-commit: d5b36a4dbd06c5e8e36ca8ccc552f679069e2946
+Patch-mainline: v5.19-rc7
+References: git-fixes
+
+As Chris explains, the comment above exit_itimers() is not correct,
+we can race with proc_timers_seq_ops. Change exit_itimers() to clear
+signal->posix_timers with ->siglock held.
+
+Cc:
+Reported-by: chris@accessvector.net
+Signed-off-by: Oleg Nesterov
+Signed-off-by: Linus Torvalds
+Acked-by: Takashi Iwai
+
+---
+ fs/exec.c | 2 +-
+ include/linux/sched/task.h | 2 +-
+ kernel/exit.c | 2 +-
+ kernel/time/posix-timers.c | 19 ++++++++++++++-----
+ 4 files changed, 17 insertions(+), 8 deletions(-)
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -1296,7 +1296,7 @@ int begin_new_exec(struct linux_binprm *
+ bprm->mm = NULL;
+
+ #ifdef CONFIG_POSIX_TIMERS
+- exit_itimers(me->signal);
++ exit_itimers(me);
+ flush_itimer_signals();
+ #endif
+
+--- a/include/linux/sched/task.h
++++ b/include/linux/sched/task.h
+@@ -81,7 +81,7 @@ static inline void exit_thread(struct ta
+ extern void do_group_exit(int);
+
+ extern void exit_files(struct task_struct *);
+-extern void exit_itimers(struct signal_struct *);
++extern void exit_itimers(struct task_struct *);
+
+ extern pid_t kernel_clone(struct kernel_clone_args *kargs);
+ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -798,7 +798,7 @@ void __noreturn do_exit(long code)
+
+ #ifdef CONFIG_POSIX_TIMERS
+ hrtimer_cancel(&tsk->signal->real_timer);
+- exit_itimers(tsk->signal);
++ exit_itimers(tsk);
+ #endif
+ if (tsk->mm)
+ setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
+--- a/kernel/time/posix-timers.c
++++ b/kernel/time/posix-timers.c
+@@ -1051,15 +1051,24 @@ retry_delete:
+ }
+
+ /*
+- * This is called by do_exit or de_thread, only when there are no more
+- * references to the shared signal_struct.
++ * This is called by do_exit or de_thread, only when nobody else can
++ * modify the signal->posix_timers list. Yet we need sighand->siglock
++ * to prevent the race with /proc/pid/timers.
+ */
+-void exit_itimers(struct signal_struct *sig)
++void exit_itimers(struct task_struct *tsk)
+ {
++ struct list_head timers;
+ struct k_itimer *tmr;
+
+- while (!list_empty(&sig->posix_timers)) {
+- tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
++ if (list_empty(&tsk->signal->posix_timers))
++ return;
++
++ spin_lock_irq(&tsk->sighand->siglock);
++ list_replace_init(&tsk->signal->posix_timers, &timers);
++ spin_unlock_irq(&tsk->sighand->siglock);
++
++ while (!list_empty(&timers)) {
++ tmr = list_first_entry(&timers, struct k_itimer, list);
+ itimer_delete(tmr);
+ }
+ }
diff --git a/patches.suse/posix-cpu-timers-Cleanup-CPU-timers-before-freeing-t.patch b/patches.suse/posix-cpu-timers-Cleanup-CPU-timers-before-freeing-t.patch
new file mode 100644
index 0000000..df8a9b5
--- /dev/null
+++ b/patches.suse/posix-cpu-timers-Cleanup-CPU-timers-before-freeing-t.patch
@@ -0,0 +1,48 @@
+From 9e74142ab2fe94f2b12626357cb2cbe7075b9b4f Mon Sep 17 00:00:00 2001
+From: Thadeu Lima de Souza Cascardo
+Date: Fri, 22 Jul 2022 06:52:11 -0300
+Subject: [PATCH] posix-cpu-timers: Cleanup CPU timers before freeing them
+ during exec
+Patch-mainline: Not yet, embargoed
+References: CVE-2022-2585 bsc#1202094
+
+Commit 55e8c8eb2c7b ("posix-cpu-timers: Store a reference to a pid not a
+task") started looking up tasks by PID when deleting a CPU timer.
+
+When a non-leader thread calls execve, it will switch PIDs with the leader
+process. Then, as it calls exit_itimers, posix_cpu_timer_del cannot find
+the task because the timer still points to the old PID.
+
+That means that armed timers won't be disarmed, that is, they won't be
+removed from the timerqueue_list. exit_itimers will still release their
+memory, and when that list is later processed, it leads to a
+use-after-free.
+
+Clean up the timers from the de-threaded task before freeing them. This
+prevents a reported use-after-free.
+
+Fixes: 55e8c8eb2c7b ("posix-cpu-timers: Store a reference to a pid not a task")
+Reported-by: "An independent security researcher working with SSD Secure Disclosure"
+Signed-off-by: Thadeu Lima de Souza Cascardo
+Signed-off-by: Frederic Weisbecker
+---
+ fs/exec.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/fs/exec.c b/fs/exec.c
+index 778123259e42..1c6b477dad69 100644
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -1301,6 +1301,9 @@ int begin_new_exec(struct linux_binprm * bprm)
+ bprm->mm = NULL;
+
+ #ifdef CONFIG_POSIX_TIMERS
++ spin_lock_irq(&me->sighand->siglock);
++ posix_cpu_timers_exit(me);
++ spin_unlock_irq(&me->sighand->siglock);
+ exit_itimers(me);
+ flush_itimer_signals();
+ #endif
+--
+2.34.1
+
diff --git a/patches.suse/sched-core-Do-not-requeue-task-on-CPU-excluded-from-cpus_mask.patch b/patches.suse/sched-core-Do-not-requeue-task-on-CPU-excluded-from-cpus_mask.patch
new file mode 100644
index 0000000..c6fcfe7
--- /dev/null
+++ b/patches.suse/sched-core-Do-not-requeue-task-on-CPU-excluded-from-cpus_mask.patch
@@ -0,0 +1,103 @@
+From 3292d9ee97316c6ad73179c0f9ac1079da5af71b Mon Sep 17 00:00:00 2001
+From: Mel Gorman
+Date: Tue, 2 Aug 2022 09:56:58 +0100
+Subject: [PATCH] sched/core: Do not requeue task on CPU excluded from
+ cpus_mask
+
+References: bnc#1199356
+Patch-mainline: Not yet, needs to be posted upstream for review
+
+The following warning was triggered on a large machine early in boot on
+a distribution kernel but the same problem should also affect mainline.
+
+[ 7.732929] ------------[ cut here ]------------
+[ 7.736736] WARNING: CPU: 439 PID: 10 at ../kernel/workqueue.c:2231 process_one_work+0x4d/0x440
+[ 7.748731] Modules linked in:
+[ 7.748731] Supported: Yes
+[ 7.748731] CPU: 439 PID: 10 Comm: mm_percpu_wq Not tainted
+[ 7.748731] Hardware name: HPE Superdome Flex/Superdome Flex, BIOS Bundle:3.60.4 SFW:IP147.007.006.012.000.2202031149 02/03/2022
+[ 7.748731] Workqueue: 0x0 (mm_percpu_wq)
+[ 7.748731] RIP: 0010:process_one_work+0x4d/0x440
+[ 7.748731] Code: 00 00 00 00 4c 0f 44 e0 49 8b 44 24 08 44 8b a8 00 01 00 00 41 83 e5 20 f6 45 10 04 75 0e 65 8b 05 98 64 f5 4e 39 45 04 74 02 <0f> 0b 48 ba eb 83 b5 80 46 86 c8 61 48 0f af d6 48 c1 ea 3a 48 8b
+[ 7.748731] RSP: 0000:ffffb15980183e88 EFLAGS: 00010093
+[ 7.748731] RAX: 00000000000001b7 RBX: ffff968ec02121f0 RCX: ffff96ddbcbf32a0
+[ 7.748731] RDX: ffff96ddbcbf3298 RSI: ffff96ddbcbeff40 RDI: ffff968ec02121c0
+[ 7.748731] RBP: ffff96ddbcbf3280 R08: ffff96ddbcc745d0 R09: 0000000000000000
+[ 7.748731] R10: ffffb15980183d98 R11: 0000000000000213 R12: ffff96ddbcbfcd00
+[ 7.748731] R13: 0000000000000000 R14: ffff96ddbcbfcd88 R15: ffff968ec02121c0
+[ 7.748731] FS: 0000000000000000(0000) GS:ffff96ddbcc40000(0000) knlGS:0000000000000000
+[ 7.748731] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 7.748731] CR2: 0000000000000000 CR3: 00000028b8810001 CR4: 00000000007706e0
+[ 7.748731] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+[ 7.748731] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+[ 7.748731] PKRU: 55555554
+[ 7.748731] Call Trace:
+[ 7.748731]
+[ 7.748731] rescuer_thread+0x1f6/0x360
+[ 7.748731] ? cancel_delayed_work+0xa0/0xa0
+[ 7.748731] kthread+0x156/0x180
+[ 7.748731] ? set_kthread_struct+0x40/0x40
+[ 7.748731] ret_from_fork+0x22/0x30
+[ 7.748731]
+[ 7.748731] ---[ end trace 9cfa5ca579d758e4 ]---
+
+Commit c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on
+p->on_cpu") optimises ttwu by queueing a task that is descheduling
+on the wakelist but does not check if the task descheduling is still
+allowed to run on that CPU.
+
+In this warning, the problematic task is a workqueue rescue thread which
+checks if the rescue is for a per-cpu workqueue and is running on the wrong CPU.
+While this is early in boot and it should be possible to create workers,
+the rescue thread may still be used if the MAYDAY_INITIAL_TIMEOUT is reached
+or MAYDAY_INTERVAL and on a sufficiently large machine, the rescue
+thread is being used frequently.
+
+Tracing confirmed that the task should have migrated properly using the
+stopper thread to handle the migration. However, a parallel wakeup from
+udev running on another CPU observes p->on_cpu and uses task_cpu(p),
+queues the task on the old CPU and triggers the warning.
+
+Check that the wakee task that is descheduling is still allowed to run
+on its current CPU and if not, wait for the descheduling to complete
+and select an allowed CPU.
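+
+For illustration only, a condensed sketch of the check this change adds to
+ttwu_queue_cond(); cpu_active() and cpumask_test_cpu() are the existing
+kernel helpers and the signature follows the hunk below:
+
+	static inline bool
+	ttwu_queue_cond(struct task_struct *p, int cpu, int wake_flags)
+	{
+		if (!cpu_active(cpu))
+			return false;
+		/* Never queue the wakee on a CPU excluded from its cpus_mask. */
+		if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+			return false;
+		/* ... the existing cache-sharing checks are unchanged ... */
+		return true;
+	}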
+
+Signed-off-by: Mel Gorman
+---
+ kernel/sched/core.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index 6a4f05af4188..b15d513b5dd1 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -3562,7 +3562,8 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
+ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
+ }
+
+-static inline bool ttwu_queue_cond(int cpu, int wake_flags)
++static inline bool
++ttwu_queue_cond(struct task_struct *p, int cpu, int wake_flags)
+ {
+ /*
+ * Do not complicate things with the async wake_list while the CPU is
+@@ -3571,6 +3572,10 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+ if (!cpu_active(cpu))
+ return false;
+
++ /* Ensure the task will still be allowed to run on the CPU. */
++ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
++ return false;
++
+ /*
+ * If the CPU does not share cache, then queue the task on the
+ * remote rqs wakelist to avoid accessing remote data.
+ */
+@@ -3592,7 +3597,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+
+ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+ {
+- if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
++ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu, wake_flags)) {
+ if (WARN_ON_ONCE(cpu == smp_processor_id()))
+ return false;
+
diff --git a/patches.suse/x86-speculation-Add-LFENCE-to-RSB-fill-sequence.patch b/patches.suse/x86-speculation-Add-LFENCE-to-RSB-fill-sequence.patch
new file mode 100644
index 0000000..4331ff6
--- /dev/null
+++ b/patches.suse/x86-speculation-Add-LFENCE-to-RSB-fill-sequence.patch
@@ -0,0 +1,66 @@
+From: Pawan Gupta
+Date: Tue, 2 Aug 2022 15:47:02 -0700
+Subject: x86/speculation: Add LFENCE to RSB fill sequence
+Git-commit: ba6e31af2be96c4d0536f2152ed6f7b6c11bca47
+Patch-mainline: Queued in a subsystem tree
+Git-repo: git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
+References: bsc#1201726 CVE-2022-26373
+
+RSB fill sequence does not have any protection for miss-prediction of
+conditional branch at the end of the sequence. CPU can speculatively
+execute code immediately after the sequence, while RSB filling hasn't
+completed yet.
+
+ #define __FILL_RETURN_BUFFER(reg, nr, sp) \
+ mov $(nr/2), reg; \
+ 771: \
+ ANNOTATE_INTRA_FUNCTION_CALL; \
+ call 772f; \
+ 773: /* speculation trap */ \
+ UNWIND_HINT_EMPTY; \
+ pause; \
+ lfence; \
+ jmp 773b; \
+ 772: \
+ ANNOTATE_INTRA_FUNCTION_CALL; \
+ call 774f; \
+ 775: /* speculation trap */ \
+ UNWIND_HINT_EMPTY; \
+ pause; \
+ lfence; \
+ jmp 775b; \
+ 774: \
+ add $(BITS_PER_LONG/8) * 2, sp; \
+ dec reg; \
+ jnz 771b; <----- CPU can miss-predict here.
+
+Before RSB is filled, RETs that come in program order after this macro
+can be executed speculatively, making them vulnerable to RSB-based
+attacks.
+
+Mitigate it by adding an LFENCE after the conditional branch to prevent
+speculation while RSB is being filled.
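+
+As an illustration only, a minimal sketch of where the barrier lands in
+the tail of the sequence (the hunk below makes the equivalent change to
+__FILL_RETURN_BUFFER itself):
+
+ 774: \
+ add $(BITS_PER_LONG/8) * 2, sp; \
+ dec reg; \
+ jnz 771b; \
+ lfence; /* stop speculation past the mispredicted jnz */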
+
+Suggested-by: Andrew Cooper
+Signed-off-by: Pawan Gupta
+Signed-off-by: Borislav Petkov
+---
+ arch/x86/include/asm/nospec-branch.h | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
+index 4c9ba49d9b3e..d3a3cc6772ee 100644
+--- a/arch/x86/include/asm/nospec-branch.h
++++ b/arch/x86/include/asm/nospec-branch.h
+@@ -60,7 +60,9 @@
+ 774: \
+ add $(BITS_PER_LONG/8) * 2, sp; \
+ dec reg; \
+- jnz 771b;
++ jnz 771b; \
++ /* barrier for jnz misprediction */ \
++ lfence;
+
+ #ifdef __ASSEMBLY__
+
diff --git a/patches.suse/x86-speculation-Add-RSB-VM-Exit-protections.patch b/patches.suse/x86-speculation-Add-RSB-VM-Exit-protections.patch
new file mode 100644
index 0000000..91d6595
--- /dev/null
+++ b/patches.suse/x86-speculation-Add-RSB-VM-Exit-protections.patch
@@ -0,0 +1,401 @@
+From: Daniel Sneddon
+Date: Tue, 2 Aug 2022 15:47:01 -0700
+Subject: x86/speculation: Add RSB VM Exit protections
+Git-commit: 2b1299322016731d56807aa49254a5ea3080b6b3
+Git-repo: git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
+Patch-mainline: Queued in a subsystem tree
+References: bsc#1201726 CVE-2022-26373
+
+tl;dr: The Enhanced IBRS mitigation for Spectre v2 does not work as
+documented for RET instructions after VM exits. Mitigate it with a new
+one-entry RSB stuffing mechanism and a new LFENCE.
+
+== Background ==
+
+Indirect Branch Restricted Speculation (IBRS) was designed to help
+mitigate Branch Target Injection and Speculative Store Bypass, i.e.
+Spectre, attacks. IBRS prevents software run in less privileged modes
+from affecting branch prediction in more privileged modes. IBRS requires
+the MSR to be written on every privilege level change.
+
+To overcome some of the performance issues of IBRS, Enhanced IBRS was
+introduced. eIBRS is an "always on" IBRS, in other words, just turn
+it on once instead of writing the MSR on every privilege level change.
+When eIBRS is enabled, more privileged modes should be protected from
+less privileged modes, including protecting VMMs from guests.
+
+== Problem ==
+
+Here's a simplification of how guests are run on Linux' KVM:
+
+void run_kvm_guest(void)
+{
+ // Prepare to run guest
+ VMRESUME();
+ // Clean up after guest runs
+}
+
+The execution flow for that would look something like this to the
+processor:
+
+1. Host-side: call run_kvm_guest()
+2. Host-side: VMRESUME
+3. Guest runs, does "CALL guest_function"
+4. VM exit, host runs again
+5. Host might make some "cleanup" function calls
+6. Host-side: RET from run_kvm_guest()
+
+Now, when back on the host, there are a couple of possible scenarios of
+post-guest activity the host needs to do before executing host code:
+
+* on pre-eIBRS hardware (legacy IBRS, or nothing at all), the RSB is not
+touched and Linux has to do a 32-entry stuffing.
+
+* on eIBRS hardware, VM exit with IBRS enabled, or restoring the host
+IBRS=1 shortly after VM exit, has a documented side effect of flushing
+the RSB except in this PBRSB situation where the software needs to stuff
+the last RSB entry "by hand".
+
+IOW, with eIBRS supported, host RET instructions should no longer be
+influenced by guest behavior after the host retires a single CALL
+instruction.
+
+However, if the RET instructions are "unbalanced" with CALLs after a VM
+exit as is the RET in #6, it might speculatively use the address for the
+instruction after the CALL in #3 as an RSB prediction. This is a problem
+since the (untrusted) guest controls this address.
+
+Balanced CALL/RET instruction pairs such as in step #5 are not affected.
+
+== Solution ==
+
+The PBRSB issue affects a wide variety of Intel processors which
+support eIBRS. But not all of them need mitigation. Today,
+X86_FEATURE_RSB_VMEXIT triggers an RSB filling sequence that mitigates
+PBRSB. Systems setting RSB_VMEXIT need no further mitigation - i.e.,
+eIBRS systems which enable legacy IBRS explicitly.
+
+However, such systems (X86_FEATURE_IBRS_ENHANCED) do not set RSB_VMEXIT
+and most of them need a new mitigation.
+
+Therefore, introduce a new feature flag X86_FEATURE_RSB_VMEXIT_LITE
+which triggers a lighter-weight PBRSB mitigation versus RSB_VMEXIT.
+
+The lighter-weight mitigation performs a CALL instruction which is
+immediately followed by a speculative execution barrier (INT3). This
+steers speculative execution to the barrier -- just like a retpoline
+-- which ensures that speculation can never reach an unbalanced RET.
+Then, ensure this CALL is retired before continuing execution with an
+LFENCE.
+
+In other words, the window of exposure is opened at VM exit where RET
+behavior is troublesome. While the window is open, force RSB predictions
+sampling for RET targets to a dead end at the INT3. Close the window
+with the LFENCE.
+
+There is a subset of eIBRS systems which are not vulnerable to PBRSB.
+Add these systems to the cpu_vuln_whitelist[] as NO_EIBRS_PBRSB.
+Future systems that aren't vulnerable will set ARCH_CAP_PBRSB_NO.
+
+ [ bp: Massage, incorporate review comments from Andy Cooper. ]
+
+Signed-off-by: Daniel Sneddon
+Co-developed-by: Pawan Gupta
+Signed-off-by: Pawan Gupta
+Signed-off-by: Borislav Petkov
+---
+ Documentation/admin-guide/hw-vuln/spectre.rst | 8 ++
+ arch/x86/include/asm/cpufeatures.h | 2
+ arch/x86/include/asm/msr-index.h | 4 +
+ arch/x86/include/asm/nospec-branch.h | 17 ++++-
+ arch/x86/kernel/cpu/bugs.c | 86 +++++++++++++++++++-------
+ arch/x86/kernel/cpu/common.c | 12 +++
+ arch/x86/kvm/vmx/vmenter.S | 8 +-
+ tools/arch/x86/include/asm/cpufeatures.h | 1
+ tools/arch/x86/include/asm/msr-index.h | 4 +
+ 9 files changed, 113 insertions(+), 29 deletions(-)
+
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -301,6 +301,7 @@
+ #define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */
+ #define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */
+ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */
++#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */
+
+ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
+ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
+@@ -447,5 +448,6 @@
+ #define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */
+ #define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */
+ #define X86_BUG_RETBLEED X86_BUG(26) /* CPU is affected by RETBleed */
++#define X86_BUG_EIBRS_PBRSB X86_BUG(27) /* EIBRS is vulnerable to Post Barrier RSB Predictions */
+
+ #endif /* _ASM_X86_CPUFEATURES_H */
+--- a/arch/x86/include/asm/msr-index.h
++++ b/arch/x86/include/asm/msr-index.h
+@@ -139,6 +139,10 @@
+ * bit available to control VERW
+ * behavior.
+ */
++#define ARCH_CAP_PBRSB_NO BIT(24) /*
++ * Not susceptible to Post-Barrier
++ * Return Stack Buffer Predictions.
++ */
+
+ #define MSR_IA32_FLUSH_CMD 0x0000010b
+ #define L1D_FLUSH BIT(0) /*
+--- a/arch/x86/include/asm/nospec-branch.h
++++ b/arch/x86/include/asm/nospec-branch.h
+@@ -117,13 +117,28 @@
+ #endif
+ .endm
+
++.macro ISSUE_UNBALANCED_RET_GUARD
++ ANNOTATE_INTRA_FUNCTION_CALL
++ call .Lunbalanced_ret_guard_\@
++ int3
++.Lunbalanced_ret_guard_\@:
++ add $(BITS_PER_LONG/8), %_ASM_SP
++ lfence
++.endm
++
+ /*
+ * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
+ * monstrosity above, manually.
+ */
+-.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req
++.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2
++.ifb \ftr2
+ ALTERNATIVE "jmp .Lskip_rsb_\@", "", \ftr
++.else
++ ALTERNATIVE_2 "jmp .Lskip_rsb_\@", "", \ftr, "jmp .Lunbalanced_\@", \ftr2
++.endif
+ __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)
++.Lunbalanced_\@:
++ ISSUE_UNBALANCED_RET_GUARD
+ .Lskip_rsb_\@:
+ .endm
+
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -1265,6 +1265,53 @@ static enum spectre_v2_mitigation __init
+ return SPECTRE_V2_RETPOLINE;
+ }
+
++static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode)
++{
++ /*
++ * Similar to context switches, there are two types of RSB attacks
++ * after VM exit:
++ *
++ * 1) RSB underflow
++ *
++ * 2) Poisoned RSB entry
++ *
++ * When retpoline is enabled, both are mitigated by filling/clearing
++ * the RSB.
++ *
++ * When IBRS is enabled, while #1 would be mitigated by the IBRS branch
++ * prediction isolation protections, RSB still needs to be cleared
++ * because of #2. Note that SMEP provides no protection here, unlike
++ * user-space-poisoned RSB entries.
++ *
++ * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB
++ * bug is present then a LITE version of RSB protection is required,
++ * just a single call needs to retire before a RET is executed.
++ */
++ switch (mode) {
++ case SPECTRE_V2_NONE:
++ return;
++
++ case SPECTRE_V2_EIBRS_LFENCE:
++ case SPECTRE_V2_EIBRS:
++ if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
++ setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE);
++ pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n");
++ }
++ return;
++
++ case SPECTRE_V2_EIBRS_RETPOLINE:
++ case SPECTRE_V2_RETPOLINE:
++ case SPECTRE_V2_LFENCE:
++ case SPECTRE_V2_IBRS:
++ setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
++ pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n");
++ return;
++ }
++
++ pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit");
++ dump_stack();
++}
++
+ static void __init spectre_v2_select_mitigation(void)
+ {
+ enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
+@@ -1402,28 +1449,7 @@ static void __init spectre_v2_select_mit
+ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
+ pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
+
+- /*
+- * Similar to context switches, there are two types of RSB attacks
+- * after vmexit:
+- *
+- * 1) RSB underflow
+- *
+- * 2) Poisoned RSB entry
+- *
+- * When retpoline is enabled, both are mitigated by filling/clearing
+- * the RSB.
+- *
+- * When IBRS is enabled, while #1 would be mitigated by the IBRS branch
+- * prediction isolation protections, RSB still needs to be cleared
+- * because of #2. Note that SMEP provides no protection here, unlike
+- * user-space-poisoned RSB entries.
+- *
+- * eIBRS, on the other hand, has RSB-poisoning protections, so it
+- * doesn't need RSB clearing after vmexit.
+- */
+- if (boot_cpu_has(X86_FEATURE_RETPOLINE) ||
+- boot_cpu_has(X86_FEATURE_KERNEL_IBRS))
+- setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
++ spectre_v2_determine_rsb_fill_type_at_vmexit(mode);
+
+ /*
+ * Retpoline protects the kernel, but doesn't protect firmware. IBRS
+@@ -2155,6 +2181,19 @@ static char *ibpb_state(void)
+ return "";
+ }
+
++static char *pbrsb_eibrs_state(void)
++{
++ if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
++ if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) ||
++ boot_cpu_has(X86_FEATURE_RSB_VMEXIT))
++ return ", PBRSB-eIBRS: SW sequence";
++ else
++ return ", PBRSB-eIBRS: Vulnerable";
++ } else {
++ return ", PBRSB-eIBRS: Not affected";
++ }
++}
++
+ static ssize_t spectre_v2_show_state(char *buf)
+ {
+ if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
+@@ -2167,12 +2206,13 @@ static ssize_t spectre_v2_show_state(cha
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
+
+- return sprintf(buf, "%s%s%s%s%s%s\n",
++ return sprintf(buf, "%s%s%s%s%s%s%s\n",
+ spectre_v2_strings[spectre_v2_enabled],
+ ibpb_state(),
+ boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+ stibp_state(),
+ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
++ pbrsb_eibrs_state(),
+ spectre_v2_module_string());
+ }
+
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -1030,6 +1030,7 @@ static void identify_cpu_without_cpuid(s
+ #define NO_SWAPGS BIT(6)
+ #define NO_ITLB_MULTIHIT BIT(7)
+ #define NO_SPECTRE_V2 BIT(8)
++#define NO_EIBRS_PBRSB BIT(9)
+
+ #define VULNWL(vendor, family, model, whitelist) \
+ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist)
+@@ -1070,7 +1071,7 @@ static const __initconst struct x86_cpu_
+
+ VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+- VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
++ VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
+
+ /*
+ * Technically, swapgs isn't serializing on AMD (despite it previously
+@@ -1080,7 +1081,9 @@ static const __initconst struct x86_cpu_
+ * good enough for our purposes.
+ */
+
+- VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT),
++ VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB),
++ VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB),
++ VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
+
+ /* AMD Family 0xf - 0x12 */
+ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+@@ -1258,6 +1261,11 @@ static void __init cpu_set_bug_bits(stru
+ setup_force_cpu_bug(X86_BUG_RETBLEED);
+ }
+
++ if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) &&
++ !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) &&
++ !(ia32_cap & ARCH_CAP_PBRSB_NO))
++ setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB);
++
+ if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
+ return;
+
+--- a/arch/x86/kvm/vmx/vmenter.S
++++ b/arch/x86/kvm/vmx/vmenter.S
+@@ -226,11 +226,13 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL
+ * entries and (in some cases) RSB underflow.
+ *
+ * eIBRS has its own protection against poisoned RSB, so it doesn't
+- * need the RSB filling sequence. But it does need to be enabled
+- * before the first unbalanced RET.
++ * need the RSB filling sequence. But it does need to be enabled, and a
++ * single call to retire, before the first unbalanced RET.
+ */
+
+- FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT
++ FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\
++ X86_FEATURE_RSB_VMEXIT_LITE
++
+
+ pop %_ASM_ARG2 /* @flags */
+ pop %_ASM_ARG1 /* @vmx */
+--- a/Documentation/admin-guide/hw-vuln/spectre.rst
++++ b/Documentation/admin-guide/hw-vuln/spectre.rst
+@@ -422,6 +422,14 @@ The possible values in this file are:
+ 'RSB filling' Protection of RSB on context switch enabled
+ ============= ===========================================
+
++ - EIBRS Post-barrier Return Stack Buffer (PBRSB) protection status:
++
++ =========================== =======================================================
++ 'PBRSB-eIBRS: SW sequence' CPU is affected and protection of RSB on VMEXIT enabled
++ 'PBRSB-eIBRS: Vulnerable' CPU is vulnerable
++ 'PBRSB-eIBRS: Not affected' CPU is not affected by PBRSB
++ =========================== =======================================================
++
+ Full mitigation might require a microcode update from the CPU
+ vendor. When the necessary microcode is not available, the kernel will
+ report vulnerability.
+--- a/tools/arch/x86/include/asm/cpufeatures.h
++++ b/tools/arch/x86/include/asm/cpufeatures.h
+@@ -294,6 +294,7 @@
+ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
+ #define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */
+ #define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */
++#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM-Exit when EIBRS is enabled */
+
+ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
+ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
+--- a/tools/arch/x86/include/asm/msr-index.h
++++ b/tools/arch/x86/include/asm/msr-index.h
+@@ -138,6 +138,10 @@
+ * bit available to control VERW
+ * behavior.
+ */
++#define ARCH_CAP_PBRSB_NO BIT(24) /*
++ * Not susceptible to Post-Barrier
++ * Return Stack Buffer Predictions.
++ */
+
+ #define MSR_IA32_FLUSH_CMD 0x0000010b
+ #define L1D_FLUSH BIT(0) /*
diff --git a/series.conf b/series.conf
index 40bcba3..ab1a310 100644
--- a/series.conf
+++ b/series.conf
@@ -12246,6 +12246,7 @@
 patches.suse/dmaengine-lgm-Fix-an-error-handling-path-in-intel_ld.patch
 patches.suse/dt-bindings-dma-allwinner-sun50i-a64-dma-Fix-min-max.patch
 patches.suse/ida-don-t-use-BUG_ON-for-debugging.patch
+ patches.suse/fix-race-between-exit_itimers-and-proc-pid-timers.patch
 patches.suse/x86-kvm-vmx-Make-noinstr-clean.patch
 patches.suse/x86-cpufeatures-Move-RETPOLINE-flags-to-word-11.patch
 patches.suse/x86-retpoline-Cleanup-some-ifdefery.patch
@@ -12470,6 +12471,10 @@
 patches.suse/ath9k-fix-use-after-free-in-ath9k_hif_usb_rx_cb.patch
+
+ # tip
+ patches.suse/x86-speculation-Add-RSB-VM-Exit-protections.patch
+ patches.suse/x86-speculation-Add-LFENCE-to-RSB-fill-sequence.patch
+
 ########################################################
 # kbuild/module infrastructure fixes
 ########################################################
@@ -12554,6 +12559,12 @@
 # bnc#1193175
 patches.suse/sched-fair-Revert-update_pick_idlest-Select-group-with-lowest-group_util-when-idle_cpus-are-equal.patch
+ # bnc#1199356
+ patches.suse/sched-core-Do-not-requeue-task-on-CPU-excluded-from-cpus_mask.patch
+
+ # bsc#1202094
+ patches.suse/posix-cpu-timers-Cleanup-CPU-timers-before-freeing-t.patch
+
 ########################################################
 # Memory management
 ########################################################