diff --git a/patches.kabi/bpf-prevent-memory-disambiguation-attack.patch b/patches.kabi/bpf-prevent-memory-disambiguation-attack.patch
index cb30c6e..75e1935 100644
--- a/patches.kabi/bpf-prevent-memory-disambiguation-attack.patch
+++ b/patches.kabi/bpf-prevent-memory-disambiguation-attack.patch
@@ -5,20 +5,25 @@ References: bsc#1087082
Signed-off-by: Jiri Kosina
---
+Replace `int sanitize_stack_off` with `bool sanitize_stack_spill` to adapt to
+changes introduced in "bpf: Fix leakage due to insufficient speculative
+store bypass mitigation"
+
+Signed-off-by: Shung-Hsi Yu
+---
include/linux/bpf_verifier.h | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
-@@ -127,9 +127,11 @@ struct bpf_insn_aux_data {
- u32 alu_limit; /* limit for add/sub register with pointer */
+@@ -130,8 +130,10 @@ struct bpf_insn_aux_data {
};
int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
-- int sanitize_stack_off; /* stack slot to be cleared */
bool seen; /* this insn was processed by the verifier */
+- bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
u8 alu_state; /* used in combination with alu_limit */
+#ifndef __GENKSYMS__
-+ int sanitize_stack_off; /* stack slot to be cleared */
++ bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
+#endif
};
diff --git a/patches.kabi/bpf-prevent-out-of-bounds-speculation-on-pointer-ari.patch b/patches.kabi/bpf-prevent-out-of-bounds-speculation-on-pointer-ari.patch
index ab5c697..ea4bc78 100644
--- a/patches.kabi/bpf-prevent-out-of-bounds-speculation-on-pointer-ari.patch
+++ b/patches.kabi/bpf-prevent-out-of-bounds-speculation-on-pointer-ari.patch
@@ -5,6 +5,12 @@ References: bsc#1068032 CVE-2017-5753
Signed-off-by: Gary Lin
---
+Replace `int sanitize_stack_off` with `bool sanitize_stack_spill` to adapt to
+changes introduced in "bpf: Fix leakage due to insufficient speculative
+store bypass mitigation"
+
+Signed-off-by: Shung-Hsi Yu
+---
include/linux/bpf_verifier.h | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
@@ -20,13 +26,13 @@ Signed-off-by: Gary Lin
};
/* linked list of verifier states used to prune search */
-@@ -134,9 +136,9 @@ struct bpf_insn_aux_data {
+@@ -136,9 +138,9 @@ struct bpf_insn_aux_data {
};
int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
bool seen; /* this insn was processed by the verifier */
- u8 alu_state; /* used in combination with alu_limit */
#ifndef __GENKSYMS__
- int sanitize_stack_off; /* stack slot to be cleared */
+ bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
+ u8 alu_state; /* used in combination with alu_limit */
#endif
};
diff --git a/patches.suse/bpf-Fix-leakage-due-to-insufficient-speculative-stor.patch b/patches.suse/bpf-Fix-leakage-due-to-insufficient-speculative-stor.patch
new file mode 100644
index 0000000..85511be
--- /dev/null
+++ b/patches.suse/bpf-Fix-leakage-due-to-insufficient-speculative-stor.patch
@@ -0,0 +1,427 @@
+From: Daniel Borkmann
+Date: Tue, 13 Jul 2021 08:18:31 +0000
+Subject: bpf: Fix leakage due to insufficient speculative store bypass
+ mitigation
+Patch-mainline: v5.14-rc4
+Git-commit: 2039f26f3aca5b0e419b98f65dd36481337b86ee
+References: bsc#1188983, bsc#1188985, CVE-2021-34556, CVE-2021-35477
+
+Spectre v4 gadgets make use of memory disambiguation, which is a set of
+techniques that execute memory access instructions, that is, loads and
+stores, out of program order; Intel's optimization manual, section 2.4.4.5:
+
+ A load
instruction micro-op may depend on a preceding store. Many + microarchitectures block loads until all preceding store addresses are + known. The memory disambiguator predicts which loads will not depend on + any previous stores. When the disambiguator predicts that a load does + not have such a dependency, the load takes its data from the L1 data + cache. Eventually, the prediction is verified. If an actual conflict is + detected, the load and all succeeding instructions are re-executed. + +af86ca4e3088 ("bpf: Prevent memory disambiguation attack") tried to mitigate +this attack by sanitizing the memory locations through preemptive "fast" +(low latency) stores of zero prior to the actual "slow" (high latency) store +of a pointer value such that upon dependency misprediction the CPU then +speculatively executes the load of the pointer value and retrieves the zero +value instead of the attacker controlled scalar value previously stored at +that location, meaning, subsequent access in the speculative domain is then +redirected to the "zero page". + +The sanitized preemptive store of zero prior to the actual "slow" store is +done through a simple ST instruction based on r10 (frame pointer) with +relative offset to the stack location that the verifier has been tracking +on the original used register for STX, which does not have to be r10. Thus, +there are no memory dependencies for this store, since it's only using r10 +and immediate constant of zero; hence af86ca4e3088 /assumed/ a low latency +operation. + +However, a recent attack demonstrated that this mitigation is not sufficient +since the preemptive store of zero could also be turned into a "slow" store +and is thus bypassed as well: + + [...] + // r2 = oob address (e.g. scalar) + // r7 = pointer to map value + 31: (7b) *(u64 *)(r10 -16) = r2 + // r9 will remain "fast" register, r10 will become "slow" register below + 32: (bf) r9 = r10 + // JIT maps BPF reg to x86 reg: + // r9 -> r15 (callee saved) + // r10 -> rbp + // train store forward prediction to break dependency link between both r9 + // and r10 by evicting them from the predictor's LRU table. + 33: (61) r0 = *(u32 *)(r7 +24576) + 34: (63) *(u32 *)(r7 +29696) = r0 + 35: (61) r0 = *(u32 *)(r7 +24580) + 36: (63) *(u32 *)(r7 +29700) = r0 + 37: (61) r0 = *(u32 *)(r7 +24584) + 38: (63) *(u32 *)(r7 +29704) = r0 + 39: (61) r0 = *(u32 *)(r7 +24588) + 40: (63) *(u32 *)(r7 +29708) = r0 + [...] + 543: (61) r0 = *(u32 *)(r7 +25596) + 544: (63) *(u32 *)(r7 +30716) = r0 + // prepare call to bpf_ringbuf_output() helper. the latter will cause rbp + // to spill to stack memory while r13/r14/r15 (all callee saved regs) remain + // in hardware registers. rbp becomes slow due to push/pop latency. below is + // disasm of bpf_ringbuf_output() helper for better visual context: + // + // ffffffff8117ee20: 41 54 push r12 + // ffffffff8117ee22: 55 push rbp + // ffffffff8117ee23: 53 push rbx + // ffffffff8117ee24: 48 f7 c1 fc ff ff ff test rcx,0xfffffffffffffffc + // ffffffff8117ee2b: 0f 85 af 00 00 00 jne ffffffff8117eee0 <-- jump taken + // [...] 
+ // ffffffff8117eee0: 49 c7 c4 ea ff ff ff mov r12,0xffffffffffffffea + // ffffffff8117eee7: 5b pop rbx + // ffffffff8117eee8: 5d pop rbp + // ffffffff8117eee9: 4c 89 e0 mov rax,r12 + // ffffffff8117eeec: 41 5c pop r12 + // ffffffff8117eeee: c3 ret + 545: (18) r1 = map[id:4] + 547: (bf) r2 = r7 + 548: (b7) r3 = 0 + 549: (b7) r4 = 4 + 550: (85) call bpf_ringbuf_output#194288 + // instruction 551 inserted by verifier \ + 551: (7a) *(u64 *)(r10 -16) = 0 | /both/ are now slow stores here + // storing map value pointer r7 at fp-16 | since value of r10 is "slow". + 552: (7b) *(u64 *)(r10 -16) = r7 / + // following "fast" read to the same memory location, but due to dependency + // misprediction it will speculatively execute before insn 551/552 completes. + 553: (79) r2 = *(u64 *)(r9 -16) + // in speculative domain contains attacker controlled r2. in non-speculative + // domain this contains r7, and thus accesses r7 +0 below. + 554: (71) r3 = *(u8 *)(r2 +0) + // leak r3 + +As can be seen, the current speculative store bypass mitigation which the +verifier inserts at line 551 is insufficient since /both/, the write of +the zero sanitation as well as the map value pointer are a high latency +instruction due to prior memory access via push/pop of r10 (rbp) in contrast +to the low latency read in line 553 as r9 (r15) which stays in hardware +registers. Thus, architecturally, fp-16 is r7, however, microarchitecturally, +fp-16 can still be r2. + +Initial thoughts to address this issue was to track spilled pointer loads +from stack and enforce their load via LDX through r10 as well so that /both/ +the preemptive store of zero /as well as/ the load use the /same/ register +such that a dependency is created between the store and load. However, this +option is not sufficient either since it can be bypassed as well under +speculation. An updated attack with pointer spill/fills now _all_ based on +r10 would look as follows: + + [...] + // r2 = oob address (e.g. scalar) + // r7 = pointer to map value + [...] + // longer store forward prediction training sequence than before. + 2062: (61) r0 = *(u32 *)(r7 +25588) + 2063: (63) *(u32 *)(r7 +30708) = r0 + 2064: (61) r0 = *(u32 *)(r7 +25592) + 2065: (63) *(u32 *)(r7 +30712) = r0 + 2066: (61) r0 = *(u32 *)(r7 +25596) + 2067: (63) *(u32 *)(r7 +30716) = r0 + // store the speculative load address (scalar) this time after the store + // forward prediction training. + 2068: (7b) *(u64 *)(r10 -16) = r2 + // preoccupy the CPU store port by running sequence of dummy stores. 
+ 2069: (63) *(u32 *)(r7 +29696) = r0 + 2070: (63) *(u32 *)(r7 +29700) = r0 + 2071: (63) *(u32 *)(r7 +29704) = r0 + 2072: (63) *(u32 *)(r7 +29708) = r0 + 2073: (63) *(u32 *)(r7 +29712) = r0 + 2074: (63) *(u32 *)(r7 +29716) = r0 + 2075: (63) *(u32 *)(r7 +29720) = r0 + 2076: (63) *(u32 *)(r7 +29724) = r0 + 2077: (63) *(u32 *)(r7 +29728) = r0 + 2078: (63) *(u32 *)(r7 +29732) = r0 + 2079: (63) *(u32 *)(r7 +29736) = r0 + 2080: (63) *(u32 *)(r7 +29740) = r0 + 2081: (63) *(u32 *)(r7 +29744) = r0 + 2082: (63) *(u32 *)(r7 +29748) = r0 + 2083: (63) *(u32 *)(r7 +29752) = r0 + 2084: (63) *(u32 *)(r7 +29756) = r0 + 2085: (63) *(u32 *)(r7 +29760) = r0 + 2086: (63) *(u32 *)(r7 +29764) = r0 + 2087: (63) *(u32 *)(r7 +29768) = r0 + 2088: (63) *(u32 *)(r7 +29772) = r0 + 2089: (63) *(u32 *)(r7 +29776) = r0 + 2090: (63) *(u32 *)(r7 +29780) = r0 + 2091: (63) *(u32 *)(r7 +29784) = r0 + 2092: (63) *(u32 *)(r7 +29788) = r0 + 2093: (63) *(u32 *)(r7 +29792) = r0 + 2094: (63) *(u32 *)(r7 +29796) = r0 + 2095: (63) *(u32 *)(r7 +29800) = r0 + 2096: (63) *(u32 *)(r7 +29804) = r0 + 2097: (63) *(u32 *)(r7 +29808) = r0 + 2098: (63) *(u32 *)(r7 +29812) = r0 + // overwrite scalar with dummy pointer; same as before, also including the + // sanitation store with 0 from the current mitigation by the verifier. + 2099: (7a) *(u64 *)(r10 -16) = 0 | /both/ are now slow stores here + 2100: (7b) *(u64 *)(r10 -16) = r7 | since store unit is still busy. + // load from stack intended to bypass stores. + 2101: (79) r2 = *(u64 *)(r10 -16) + 2102: (71) r3 = *(u8 *)(r2 +0) + // leak r3 + [...] + +Looking at the CPU microarchitecture, the scheduler might issue loads (such +as seen in line 2101) before stores (line 2099,2100) because the load execution +units become available while the store execution unit is still busy with the +sequence of dummy stores (line 2069-2098). And so the load may use the prior +stored scalar from r2 at address r10 -16 for speculation. The updated attack +may work less reliable on CPU microarchitectures where loads and stores share +execution resources. + +This concludes that the sanitizing with zero stores from af86ca4e3088 ("bpf: +Prevent memory disambiguation attack") is insufficient. Moreover, the detection +of stack reuse from af86ca4e3088 where previously data (STACK_MISC) has been +written to a given stack slot where a pointer value is now to be stored does +not have sufficient coverage as precondition for the mitigation either; for +several reasons outlined as follows: + + 1) Stack content from prior program runs could still be preserved and is + therefore not "random", best example is to split a speculative store + bypass attack between tail calls, program A would prepare and store the + oob address at a given stack slot and then tail call into program B which + does the "slow" store of a pointer to the stack with subsequent "fast" + read. From program B PoV such stack slot type is STACK_INVALID, and + therefore also must be subject to mitigation. + + 2) The STACK_SPILL must not be coupled to register_is_const(&stack->spilled_ptr) + condition, for example, the previous content of that memory location could + also be a pointer to map or map value. Without the fix, a speculative + store bypass is not mitigated in such precondition and can then lead to + a type confusion in the speculative domain leaking kernel memory near + these pointer types. + +While brainstorming on various alternative mitigation possibilities, we also +stumbled upon a retrospective from Chrome developers [0]: + + [...] 
For variant 4, we implemented a mitigation to zero the unused memory + of the heap prior to allocation, which cost about 1% when done concurrently + and 4% for scavenging. Variant 4 defeats everything we could think of. We + explored more mitigations for variant 4 but the threat proved to be more + pervasive and dangerous than we anticipated. For example, stack slots used + by the register allocator in the optimizing compiler could be subject to + type confusion, leading to pointer crafting. Mitigating type confusion for + stack slots alone would have required a complete redesign of the backend of + the optimizing compiler, perhaps man years of work, without a guarantee of + completeness. [...] + +From BPF side, the problem space is reduced, however, options are rather +limited. One idea that has been explored was to xor-obfuscate pointer spills +to the BPF stack: + + [...] + // preoccupy the CPU store port by running sequence of dummy stores. + [...] + 2106: (63) *(u32 *)(r7 +29796) = r0 + 2107: (63) *(u32 *)(r7 +29800) = r0 + 2108: (63) *(u32 *)(r7 +29804) = r0 + 2109: (63) *(u32 *)(r7 +29808) = r0 + 2110: (63) *(u32 *)(r7 +29812) = r0 + // overwrite scalar with dummy pointer; xored with random 'secret' value + // of 943576462 before store ... + 2111: (b4) w11 = 943576462 + 2112: (af) r11 ^= r7 + 2113: (7b) *(u64 *)(r10 -16) = r11 + 2114: (79) r11 = *(u64 *)(r10 -16) + 2115: (b4) w2 = 943576462 + 2116: (af) r2 ^= r11 + // ... and restored with the same 'secret' value with the help of AX reg. + 2117: (71) r3 = *(u8 *)(r2 +0) + [...] + +While the above would not prevent speculation, it would make data leakage +infeasible by directing it to random locations. In order to be effective +and prevent type confusion under speculation, such random secret would have +to be regenerated for each store. The additional complexity involved for a +tracking mechanism that prevents jumps such that restoring spilled pointers +would not get corrupted is not worth the gain for unprivileged. Hence, the +fix in here eventually opted for emitting a non-public BPF_ST | BPF_NOSPEC +instruction which the x86 JIT translates into a lfence opcode. Inserting the +latter in between the store and load instruction is one of the mitigations +options [1]. The x86 instruction manual notes: + + [...] An LFENCE that follows an instruction that stores to memory might + complete before the data being stored have become globally visible. [...] + +The latter meaning that the preceding store instruction finished execution +and the store is at minimum guaranteed to be in the CPU's store queue, but +it's not guaranteed to be in that CPU's L1 cache at that point (globally +visible). The latter would only be guaranteed via sfence. So the load which +is guaranteed to execute after the lfence for that local CPU would have to +rely on store-to-load forwarding. [2], in section 2.3 on store buffers says: + + [...] For every store operation that is added to the ROB, an entry is + allocated in the store buffer. This entry requires both the virtual and + physical address of the target. Only if there is no free entry in the store + buffer, the frontend stalls until there is an empty slot available in the + store buffer again. Otherwise, the CPU can immediately continue adding + subsequent instructions to the ROB and execute them out of order. On Intel + CPUs, the store buffer has up to 56 entries. [...] 
+ +One small upside on the fix is that it lifts constraints from af86ca4e3088 +where the sanitize_stack_off relative to r10 must be the same when coming +from different paths. The BPF_ST | BPF_NOSPEC gets emitted after a BPF_STX +or BPF_ST instruction. This happens either when we store a pointer or data +value to the BPF stack for the first time, or upon later pointer spills. +The former needs to be enforced since otherwise stale stack data could be +leaked under speculation as outlined earlier. For non-x86 JITs the BPF_ST | +BPF_NOSPEC mapping is currently optimized away, but others could emit a +speculation barrier as well if necessary. For real-world unprivileged +programs e.g. generated by LLVM, pointer spill/fill is only generated upon +register pressure and LLVM only tries to do that for pointers which are not +used often. The program main impact will be the initial BPF_ST | BPF_NOSPEC +sanitation for the STACK_INVALID case when the first write to a stack slot +occurs e.g. upon map lookup. In future we might refine ways to mitigate +the latter cost. + + [0] https://arxiv.org/pdf/1902.05178.pdf + [1] https://msrc-blog.microsoft.com/2018/05/21/analysis-and-mitigation-of-speculative-store-bypass-cve-2018-3639/ + [2] https://arxiv.org/pdf/1905.05725.pdf + +Fixes: af86ca4e3088 ("bpf: Prevent memory disambiguation attack") +Fixes: f7cf25b2026d ("bpf: track spill/fill of constants") +Co-developed-by: Piotr Krysiuk +Co-developed-by: Benedict Schlueter +Signed-off-by: Daniel Borkmann +Signed-off-by: Piotr Krysiuk +Signed-off-by: Benedict Schlueter +Acked-by: Alexei Starovoitov +Acked-by: Shung-Hsi Yu +--- + include/linux/bpf_verifier.h | 2 - + kernel/bpf/verifier.c | 76 +++++++++++++++++++------------------------ + 2 files changed, 35 insertions(+), 43 deletions(-) + +--- a/include/linux/bpf_verifier.h ++++ b/include/linux/bpf_verifier.h +@@ -129,8 +129,8 @@ struct bpf_insn_aux_data { + u32 alu_limit; /* limit for add/sub register with pointer */ + }; + int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ +- int sanitize_stack_off; /* stack slot to be cleared */ + bool seen; /* this insn was processed by the verifier */ ++ bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ + u8 alu_state; /* used in combination with alu_limit */ + }; + +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -706,6 +706,20 @@ static int check_stack_write(struct bpf_ + return -EACCES; + } + ++ if (!env->allow_ptr_leaks) { ++ bool sanitize = value_regno >= 0 && is_spillable_regtype(state->regs[value_regno].type); ++ ++ for (i = 0; i < size; i++) { ++ if (state->stack[spi].slot_type[i] == STACK_INVALID) { ++ sanitize = true; ++ break; ++ } ++ } ++ ++ if (sanitize) ++ env->insn_aux_data[insn_idx].sanitize_stack_spill = true; ++ } ++ + if (value_regno >= 0 && + is_spillable_regtype(state->regs[value_regno].type)) { + +@@ -720,30 +734,6 @@ static int check_stack_write(struct bpf_ + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + + for (i = 0; i < BPF_REG_SIZE; i++) { +- if (state->stack[spi].slot_type[i] == STACK_MISC && +- !env->allow_ptr_leaks) { +- int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; +- int soff = (-spi - 1) * BPF_REG_SIZE; +- +- /* detected reuse of integer stack slot with a pointer +- * which means either llvm is reusing stack slot or +- * an attacker is trying to exploit CVE-2018-3639 +- * (speculative store bypass) +- * Have to sanitize that slot with preemptive +- * store of zero. 
+- */ +- if (*poff && *poff != soff) { +- /* disallow programs where single insn stores +- * into two different stack slots, since verifier +- * cannot sanitize them +- */ +- verbose(env, +- "insn %d cannot access two stack slots fp%d and fp%d", +- insn_idx, *poff, soff); +- return -EINVAL; +- } +- *poff = soff; +- } + state->stack[spi].slot_type[i] = STACK_SPILL; + } + } else { +@@ -4644,34 +4634,33 @@ static int convert_ctx_accesses(struct b + insn = env->prog->insnsi + delta; + + for (i = 0; i < insn_cnt; i++, insn++) { ++ bool ctx_access; ++ + if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || + insn->code == (BPF_LDX | BPF_MEM | BPF_H) || + insn->code == (BPF_LDX | BPF_MEM | BPF_W) || +- insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) ++ insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) { + type = BPF_READ; +- else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || +- insn->code == (BPF_STX | BPF_MEM | BPF_H) || +- insn->code == (BPF_STX | BPF_MEM | BPF_W) || +- insn->code == (BPF_STX | BPF_MEM | BPF_DW)) ++ ctx_access = true; ++ } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || ++ insn->code == (BPF_STX | BPF_MEM | BPF_H) || ++ insn->code == (BPF_STX | BPF_MEM | BPF_W) || ++ insn->code == (BPF_STX | BPF_MEM | BPF_DW) || ++ insn->code == (BPF_ST | BPF_MEM | BPF_B) || ++ insn->code == (BPF_ST | BPF_MEM | BPF_H) || ++ insn->code == (BPF_ST | BPF_MEM | BPF_W) || ++ insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { + type = BPF_WRITE; +- else ++ ctx_access = BPF_CLASS(insn->code) == BPF_STX; ++ } else { + continue; ++ } + + if (type == BPF_WRITE && +- env->insn_aux_data[i + delta].sanitize_stack_off) { ++ env->insn_aux_data[i + delta].sanitize_stack_spill) { + struct bpf_insn patch[] = { +- /* Sanitize suspicious stack slot with zero. +- * There are no memory dependencies for this store, +- * since it's only using frame pointer and immediate +- * constant of zero +- */ +- BPF_ST_MEM(BPF_DW, BPF_REG_FP, +- env->insn_aux_data[i + delta].sanitize_stack_off, +- 0), +- /* the original STX instruction will immediately +- * overwrite the same stack slot with appropriate value +- */ + *insn, ++ BPF_ST_NOSPEC(), + }; + + cnt = ARRAY_SIZE(patch); +@@ -4685,6 +4674,9 @@ static int convert_ctx_accesses(struct b + continue; + } + ++ if (!ctx_access) ++ continue; ++ + if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) + continue; + diff --git a/patches.suse/bpf-Introduce-BPF-nospec-instruction-for-mitigating-.patch b/patches.suse/bpf-Introduce-BPF-nospec-instruction-for-mitigating-.patch new file mode 100644 index 0000000..a65caec --- /dev/null +++ b/patches.suse/bpf-Introduce-BPF-nospec-instruction-for-mitigating-.patch @@ -0,0 +1,223 @@ +From: Daniel Borkmann +Date: Tue, 13 Jul 2021 08:18:31 +0000 +Subject: bpf: Introduce BPF nospec instruction for mitigating Spectre v4 +Patch-mainline: v5.14-rc4 +Git-commit: f5e81d1117501546b7be050c5fbafa6efd2c722c +References: bsc#1188983, bsc#1188985, CVE-2021-34556, CVE-2021-35477 + +In case of JITs, each of the JIT backends compiles the BPF nospec instruction +/either/ to a machine instruction which emits a speculation barrier /or/ to +/no/ machine instruction in case the underlying architecture is not affected +by Speculative Store Bypass or has different mitigations in place already. + +This covers both x86 and (implicitly) arm64: In case of x86, we use 'lfence' +instruction for mitigation. In case of arm64, we rely on the firmware mitigation +as controlled via the ssbd kernel parameter. 
Whenever the mitigation is enabled, +it works for all of the kernel code with no need to provide any additional +instructions here (hence only comment in arm64 JIT). Other archs can follow +as needed. The BPF nospec instruction is specifically targeting Spectre v4 +since i) we don't use a serialization barrier for the Spectre v1 case, and +ii) mitigation instructions for v1 and v4 might be different on some archs. + +The BPF nospec is required for a future commit, where the BPF verifier does +annotate intermediate BPF programs with speculation barriers. + +Co-developed-by: Piotr Krysiuk +Co-developed-by: Benedict Schlueter +Signed-off-by: Daniel Borkmann +Signed-off-by: Piotr Krysiuk +Signed-off-by: Benedict Schlueter +Acked-by: Alexei Starovoitov +Acked-by: Shung-Hsi Yu +--- +syu: Drop RISCV, ARM and x86 arch code. +--- + arch/arm64/net/bpf_jit_comp.c | 13 +++++++++++++ + arch/powerpc/net/bpf_jit_comp64.c | 6 ++++++ + arch/s390/net/bpf_jit_comp.c | 5 +++++ + arch/sparc/net/bpf_jit_comp_64.c | 3 +++ + arch/x86/net/bpf_jit_comp.c | 7 +++++++ + include/linux/filter.h | 15 +++++++++++++++ + kernel/bpf/core.c | 19 ++++++++++++++++++- + kernel/bpf/disasm.c | 15 +++++++++------ + 8 files changed, 76 insertions(+), 7 deletions(-) + +--- a/arch/arm64/net/bpf_jit_comp.c ++++ b/arch/arm64/net/bpf_jit_comp.c +@@ -661,6 +661,19 @@ emit_cond_jmp: + } + break; + ++ /* speculation barrier */ ++ case BPF_ST | BPF_NOSPEC: ++ /* ++ * Nothing required here. ++ * ++ * In case of arm64, we rely on the firmware mitigation of ++ * Speculative Store Bypass as controlled via the ssbd kernel ++ * parameter. Whenever the mitigation is enabled, it works ++ * for all of the kernel code with no need to provide any ++ * additional instructions. ++ */ ++ break; ++ + /* ST: *(size *)(dst + off) = imm */ + case BPF_ST | BPF_MEM | BPF_W: + case BPF_ST | BPF_MEM | BPF_H: +--- a/arch/powerpc/net/bpf_jit_comp64.c ++++ b/arch/powerpc/net/bpf_jit_comp64.c +@@ -645,6 +645,12 @@ emit_clear: + break; + + /* ++ * BPF_ST NOSPEC (speculation barrier) ++ */ ++ case BPF_ST | BPF_NOSPEC: ++ break; ++ ++ /* + * BPF_ST(X) + */ + case BPF_STX | BPF_MEM | BPF_B: /* *(u8 *)(dst + off) = src */ +--- a/arch/s390/net/bpf_jit_comp.c ++++ b/arch/s390/net/bpf_jit_comp.c +@@ -926,6 +926,11 @@ static noinline int bpf_jit_insn(struct + } + break; + /* ++ * BPF_NOSPEC (speculation barrier) ++ */ ++ case BPF_ST | BPF_NOSPEC: ++ break; ++ /* + * BPF_ST(X) + */ + case BPF_STX | BPF_MEM | BPF_B: /* *(u8 *)(dst + off) = src_reg */ +--- a/arch/sparc/net/bpf_jit_comp_64.c ++++ b/arch/sparc/net/bpf_jit_comp_64.c +@@ -1277,6 +1277,9 @@ static int build_insn(const struct bpf_i + emit(opcode | RS1(src) | rs2 | RD(dst), ctx); + break; + } ++ /* speculation barrier */ ++ case BPF_ST | BPF_NOSPEC: ++ break; + /* ST: *(size *)(dst + off) = imm */ + case BPF_ST | BPF_MEM | BPF_W: + case BPF_ST | BPF_MEM | BPF_H: +--- a/arch/x86/net/bpf_jit_comp.c ++++ b/arch/x86/net/bpf_jit_comp.c +@@ -707,6 +707,13 @@ static int do_jit(struct bpf_prog *bpf_p + } + break; + ++ /* speculation barrier */ ++ case BPF_ST | BPF_NOSPEC: ++ if (boot_cpu_has(X86_FEATURE_XMM2)) ++ /* Emit 'lfence' */ ++ EMIT3(0x0F, 0xAE, 0xE8); ++ break; ++ + /* ST: *(u8*)(dst_reg + off) = imm */ + case BPF_ST | BPF_MEM | BPF_B: + if (is_ereg(dst_reg)) +--- a/include/linux/filter.h ++++ b/include/linux/filter.h +@@ -56,6 +56,11 @@ struct bpf_prog_aux; + /* unused opcode to mark special call to bpf_tail_call() helper */ + #define BPF_TAIL_CALL 0xf0 + ++/* unused opcode to mark speculation barrier for mitigating 
++ * Speculative Store Bypass ++ */ ++#define BPF_NOSPEC 0xc0 ++ + /* As per nm, we expose JITed images as text (code) section for + * kallsyms. That way, tools like perf can find it to match + * addresses. +@@ -307,6 +312,16 @@ struct bpf_prog_aux; + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ ++ .imm = 0 }) ++ ++/* Speculation barrier */ ++ ++#define BPF_ST_NOSPEC() \ ++ ((struct bpf_insn) { \ ++ .code = BPF_ST | BPF_NOSPEC, \ ++ .dst_reg = 0, \ ++ .src_reg = 0, \ ++ .off = 0, \ + .imm = 0 }) + + /* Internal classic blocks for direct assignment */ +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -32,6 +32,7 @@ + #include + #include + ++#include + #include + + /* Registers */ +@@ -887,6 +888,8 @@ static unsigned int ___bpf_prog_run(u64 + [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, + [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, + [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, ++ /* NOSPEC instruction */ ++ [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC, + /* Load instructions */ + [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, + [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, +@@ -1197,7 +1200,21 @@ out: + JMP_EXIT: + return BPF_R0; + +- /* STX and ST and LDX*/ ++ /* ST, STX and LDX*/ ++ ST_NOSPEC: ++ /* Speculation barrier for mitigating Speculative Store Bypass. ++ * In case of arm64, we rely on the firmware mitigation as ++ * controlled via the ssbd kernel parameter. Whenever the ++ * mitigation is enabled, it works for all of the kernel code ++ * with no need to provide any additional instructions here. ++ * In case of x86, we use 'lfence' insn for mitigation. We ++ * reuse preexisting logic from Spectre v1 mitigation that ++ * happens to produce the required code on x86 for v4 as well. ++ */ ++#ifdef CONFIG_X86 ++ barrier_nospec(); ++#endif ++ CONT; + #define LDST(SIZEOP, SIZE) \ + STX_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ +--- a/kernel/bpf/disasm.c ++++ b/kernel/bpf/disasm.c +@@ -139,15 +139,18 @@ void print_bpf_insn(bpf_insn_print_cb ve + else + verbose(env, "BUG_%02x\n", insn->code); + } else if (class == BPF_ST) { +- if (BPF_MODE(insn->code) != BPF_MEM) { ++ if (BPF_MODE(insn->code) == BPF_MEM) { ++ verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", ++ insn->code, ++ bpf_ldst_string[BPF_SIZE(insn->code) >> 3], ++ insn->dst_reg, ++ insn->off, insn->imm); ++ } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) { ++ verbose(env, "(%02x) nospec\n", insn->code); ++ } else { + verbose(env, "BUG_st_%02x\n", insn->code); + return; + } +- verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", +- insn->code, +- bpf_ldst_string[BPF_SIZE(insn->code) >> 3], +- insn->dst_reg, +- insn->off, insn->imm); + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(env, "BUG_ldx_%02x\n", insn->code); diff --git a/series.conf b/series.conf index 0b3bbc7..a2ae124 100644 --- a/series.conf +++ b/series.conf @@ -26529,6 +26529,8 @@ patches.suse/usb-max-3421-Prevent-corruption-of-freed-memory.patch patches.suse/tracing-Fix-bug-in-rb_per_cpu_empty-that-might-cause.patch patches.suse/KVM-PPC-Book3S-Fix-H_RTAS-rets-buffer-overflow.patch + patches.suse/bpf-Introduce-BPF-nospec-instruction-for-mitigating-.patch + patches.suse/bpf-Fix-leakage-due-to-insufficient-speculative-stor.patch patches.suse/Bluetooth-defer-cleanup-of-resources-in-hci_unregist.patch patches.suse/ovl-prevent-private-clone-if-bind-mount-is-not-allow.patch patches.suse/vt_kdsetmode-extend-console-locking.patch
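Reviewer note, not part of any patch above: a minimal standalone sketch of how the
non-UAPI BPF_ST | BPF_NOSPEC opcode introduced by
bpf-Introduce-BPF-nospec-instruction-for-mitigating-.patch is composed, and how it
differs from an ordinary BPF_ST | BPF_MEM store such as the "(7a)" double-word
stores seen in the listings. The macro values mirror the kernel's BPF opcode
encoding (BPF_NOSPEC 0xc0 is the value added by that patch); the program itself is
only an assumed illustration, not code taken from the patches.

  /* Sketch: decode BPF_ST | BPF_NOSPEC (0xc2) vs. BPF_ST | BPF_MEM | BPF_DW (0x7a). */
  #include <stdio.h>

  #define BPF_CLASS(code) ((code) & 0x07)
  #define BPF_MODE(code)  ((code) & 0xe0)
  #define BPF_ST          0x02
  #define BPF_MEM         0x60
  #define BPF_DW          0x18
  #define BPF_NOSPEC      0xc0  /* unused mode slot reused as speculation barrier */

  int main(void)
  {
          unsigned int nospec = BPF_ST | BPF_NOSPEC;       /* 0xc2 */
          unsigned int st_dw  = BPF_ST | BPF_MEM | BPF_DW; /* 0x7a, the "(7a)" stores */

          printf("0x%02x: class==BPF_ST %d, mode==BPF_NOSPEC %d\n",
                 nospec, BPF_CLASS(nospec) == BPF_ST, BPF_MODE(nospec) == BPF_NOSPEC);
          printf("0x%02x: class==BPF_ST %d, mode==BPF_MEM    %d\n",
                 st_dw, BPF_CLASS(st_dw) == BPF_ST, BPF_MODE(st_dw) == BPF_MEM);
          return 0;
  }

Built with any C compiler, it prints that 0xc2 decodes as class BPF_ST with the new
NOSPEC mode, while 0x7a decodes as a regular BPF_DW memory store, which is why
print_bpf_insn() above only needs the extra BPF_MODE() check to disassemble the
barrier as "nospec".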