From ebb75ee83c74c48af44857be4f0972a206fc46d8 Mon Sep 17 00:00:00 2001 From: Kernel Build Daemon Date: Nov 02 2021 07:01:22 +0000 Subject: Merge branch 'SLE15-SP2' into SLE15-SP3 --- diff --git a/patches.suse/KVM-PPC-Book3S-HV-Nested-Reflect-guest-PMU-in-use-to.patch b/patches.suse/KVM-PPC-Book3S-HV-Nested-Reflect-guest-PMU-in-use-to.patch new file mode 100644 index 0000000..43e59e2 --- /dev/null +++ b/patches.suse/KVM-PPC-Book3S-HV-Nested-Reflect-guest-PMU-in-use-to.patch @@ -0,0 +1,101 @@ +From 1782663897945a5cf28e564ba5eed730098e9aa4 Mon Sep 17 00:00:00 2001 +From: Nicholas Piggin +Date: Thu, 12 Aug 2021 02:00:43 +1000 +Subject: [PATCH] KVM: PPC: Book3S HV Nested: Reflect guest PMU in-use to L0 + when guest SPRs are live + +References: bsc#1156395 +Patch-mainline: v5.15-rc1 +Git-commit: 1782663897945a5cf28e564ba5eed730098e9aa4 + +After the L1 saves its PMU SPRs but before loading the L2's PMU SPRs, +switch the pmcregs_in_use field in the L1 lppaca to the value advertised +by the L2 in its VPA. On the way out of the L2, set it back after saving +the L2 PMU registers (if they were in-use). + +This transfers the PMU liveness indication between the L1 and L2 at the +points where the registers are not live. + +This fixes the nested HV bug for which a workaround was added to the L0 +HV by commit 63279eeb7f93a ("KVM: PPC: Book3S HV: Always save guest pmu +for guest capable of nesting"), which explains the problem in detail. +That workaround is no longer required for guests that include this bug +fix. 
+ +Fixes: 360cae313702 ("KVM: PPC: Book3S HV: Nested guest entry via hypercall") +Signed-off-by: Nicholas Piggin +Signed-off-by: Michael Ellerman +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20210811160134.904987-10-npiggin@gmail.com +Acked-by: Michal Suchanek +--- + arch/powerpc/include/asm/pmc.h | 7 +++++++ + arch/powerpc/kvm/book3s_hv.c | 20 ++++++++++++++++++++ + 2 files changed, 27 insertions(+) + +diff --git a/arch/powerpc/include/asm/pmc.h b/arch/powerpc/include/asm/pmc.h +index c6bbe9778d3c..3c09109e708e 100644 +--- a/arch/powerpc/include/asm/pmc.h ++++ b/arch/powerpc/include/asm/pmc.h +@@ -34,6 +34,13 @@ static inline void ppc_set_pmu_inuse(int inuse) + #endif + } + ++#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE ++static inline int ppc_get_pmu_inuse(void) ++{ ++ return get_paca()->pmcregs_in_use; ++} ++#endif ++ + extern void power4_enable_pmcs(void); + + #else /* CONFIG_PPC64 */ +diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c +index 15e2ba93678e..938b0948478f 100644 +--- a/arch/powerpc/kvm/book3s_hv.c ++++ b/arch/powerpc/kvm/book3s_hv.c +@@ -59,6 +59,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -3891,6 +3892,18 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); + ++#ifdef CONFIG_PPC_PSERIES ++ if (kvmhv_on_pseries()) { ++ barrier(); ++ if (vcpu->arch.vpa.pinned_addr) { ++ struct lppaca *lp = vcpu->arch.vpa.pinned_addr; ++ get_lppaca()->pmcregs_in_use = lp->pmcregs_in_use; ++ } else { ++ get_lppaca()->pmcregs_in_use = 1; ++ } ++ barrier(); ++ } ++#endif + kvmhv_load_guest_pmu(vcpu); + + msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); +@@ -4025,6 +4038,13 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, + save_pmu |= nesting_enabled(vcpu->kvm); + + kvmhv_save_guest_pmu(vcpu, save_pmu); ++#ifdef CONFIG_PPC_PSERIES ++ if 
(kvmhv_on_pseries()) { ++ barrier(); ++ get_lppaca()->pmcregs_in_use = ppc_get_pmu_inuse(); ++ barrier(); ++ } ++#endif + + vc->entry_exit_map = 0x101; + vc->in_guest = 0; +-- +2.31.1 + diff --git a/patches.suse/KVM-PPC-Book3S-HV-Nested-Sanitise-H_ENTER_NESTED-TM-.patch b/patches.suse/KVM-PPC-Book3S-HV-Nested-Sanitise-H_ENTER_NESTED-TM-.patch new file mode 100644 index 0000000..221fa27 --- /dev/null +++ b/patches.suse/KVM-PPC-Book3S-HV-Nested-Sanitise-H_ENTER_NESTED-TM-.patch @@ -0,0 +1,82 @@ +From d9c57d3ed52a92536f5fa59dc5ccdd58b4875076 Mon Sep 17 00:00:00 2001 +From: Nicholas Piggin +Date: Thu, 8 Jul 2021 21:26:22 +1000 +Subject: [PATCH] KVM: PPC: Book3S HV Nested: Sanitise H_ENTER_NESTED TM state + +References: bsc#1156395 +Patch-mainline: v5.14-rc3 +Git-commit: d9c57d3ed52a92536f5fa59dc5ccdd58b4875076 + +The H_ENTER_NESTED hypercall is handled by the L0, and it is a request +by the L1 to switch the context of the vCPU over to that of its L2 +guest, and return with an interrupt indication. The L1 is responsible +for switching some registers to guest context, and the L0 switches +others (including all the hypervisor privileged state). + +If the L2 MSR has TM active, then the L1 is responsible for +recheckpointing the L2 TM state. Then the L1 exits to L0 via the +H_ENTER_NESTED hcall, and the L0 saves the TM state as part of the exit, +and then it recheckpoints the TM state as part of the nested entry and +finally HRFIDs into the L2 with TM active MSR. Not efficient, but about +the simplest approach for something that's horrendously complicated. + +Problems arise if the L1 exits to the L0 with a TM state which does not +match the L2 TM state being requested. For example if the L1 is +transactional but the L2 MSR is non-transactional, or vice versa. The +L0's HRFID can take a TM Bad Thing interrupt and crash. 
+ +Fix this by disallowing H_ENTER_NESTED in TM[T] state entirely, and then +ensuring that if the L1 is suspended then the L2 must have TM active, +and if the L1 is not suspended then the L2 must not have TM active. + +Fixes: 360cae313702 ("KVM: PPC: Book3S HV: Nested guest entry via hypercall") +Cc: stable@vger.kernel.org # v4.20+ +Reported-by: Alexey Kardashevskiy +Acked-by: Michael Neuling +Signed-off-by: Nicholas Piggin +Signed-off-by: Michael Ellerman +Acked-by: Michal Suchanek +--- + arch/powerpc/kvm/book3s_hv_nested.c | 20 ++++++++++++++++++++ + 1 file changed, 20 insertions(+) + +diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c +--- a/arch/powerpc/kvm/book3s_hv_nested.c ++++ b/arch/powerpc/kvm/book3s_hv_nested.c +@@ -302,6 +302,9 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) + if (vcpu->kvm->arch.l1_ptcr == 0) + return H_NOT_AVAILABLE; + ++ if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr)) ++ return H_BAD_MODE; ++ + /* copy parameters in */ + hv_ptr = kvmppc_get_gpr(vcpu, 4); + err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv, +@@ -322,6 +325,23 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) + if (l2_hv.vcpu_token >= NR_CPUS) + return H_PARAMETER; + ++ /* ++ * L1 must have set up a suspended state to enter the L2 in a ++ * transactional state, and only in that case. These have to be ++ * filtered out here to prevent causing a TM Bad Thing in the ++ * host HRFID. We could synthesize a TM Bad Thing back to the L1 ++ * here but there doesn't seem like much point. 
++ */ ++ if (MSR_TM_SUSPENDED(vcpu->arch.shregs.msr)) { ++ if (!MSR_TM_ACTIVE(l2_regs.msr)) ++ return H_BAD_MODE; ++ } else { ++ if (l2_regs.msr & MSR_TS_MASK) ++ return H_BAD_MODE; ++ if (WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_TS_MASK)) ++ return H_BAD_MODE; ++ } ++ + /* translate lpid */ + l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true); + if (!l2) +-- +2.31.1 + diff --git a/patches.suse/KVM-PPC-Book3S-HV-Save-host-FSCR-in-the-P7-8-path.patch b/patches.suse/KVM-PPC-Book3S-HV-Save-host-FSCR-in-the-P7-8-path.patch new file mode 100644 index 0000000..d37024a --- /dev/null +++ b/patches.suse/KVM-PPC-Book3S-HV-Save-host-FSCR-in-the-P7-8-path.patch @@ -0,0 +1,77 @@ +From 1438709e6328925ef496dafd467dbd0353137434 Mon Sep 17 00:00:00 2001 +From: Nicholas Piggin +Date: Wed, 26 May 2021 22:58:51 +1000 +Subject: [PATCH] KVM: PPC: Book3S HV: Save host FSCR in the P7/8 path + +References: bsc#1065729 +Patch-mainline: v5.13-rc5 +Git-commit: 1438709e6328925ef496dafd467dbd0353137434 +Alt-commit: 6ba53317d497dec029bfb040b1daf38328fa00ab + +Similar to commit 25edcc50d76c ("KVM: PPC: Book3S HV: Save and restore +FSCR in the P9 path"), ensure the P7/8 path saves and restores the host +FSCR. The logic explained in that patch actually applies there to the +old path well: a context switch can be made before kvmppc_vcpu_run_hv +restores the host FSCR and returns. 
+ +Now both the p9 and the p7/8 paths now save and restore their FSCR, it +no longer needs to be restored at the end of kvmppc_vcpu_run_hv + +Fixes: b005255e12a3 ("KVM: PPC: Book3S HV: Context-switch new POWER8 SPRs") +Cc: stable@vger.kernel.org # v3.14+ +Signed-off-by: Nicholas Piggin +Reviewed-by: Fabiano Rosas +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20210526125851.3436735-1-npiggin@gmail.com +Acked-by: Michal Suchanek +--- + arch/powerpc/kvm/book3s_hv.c | 1 - + arch/powerpc/kvm/book3s_hv_rmhandlers.S | 7 +++++++ + 2 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c +index 28a80d240b76..13728495ac66 100644 +--- a/arch/powerpc/kvm/book3s_hv.c ++++ b/arch/powerpc/kvm/book3s_hv.c +@@ -4455,7 +4455,6 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) + mtspr(SPRN_EBBRR, ebb_regs[1]); + mtspr(SPRN_BESCR, ebb_regs[2]); + mtspr(SPRN_TAR, user_tar); +- mtspr(SPRN_FSCR, current->thread.fscr); + } + mtspr(SPRN_VRSAVE, user_vrsave); + +diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S +--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S ++++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S +@@ -56,6 +56,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + #define STACK_SLOT_HFSCR (SFS-72) + #define STACK_SLOT_AMR (SFS-80) + #define STACK_SLOT_UAMOR (SFS-88) ++#define STACK_SLOT_FSCR (SFS-96) + /* the following is used by the P9 short path */ + #define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */ + +@@ -711,6 +712,8 @@ BEGIN_FTR_SECTION + std r6, STACK_SLOT_DAWR(r1) + std r7, STACK_SLOT_DAWRX(r1) + std r8, STACK_SLOT_IAMR(r1) ++ mfspr r5, SPRN_FSCR ++ std r5, STACK_SLOT_FSCR(r1) + END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + + mfspr r5, SPRN_AMR +@@ -1624,6 +1627,10 @@ FTR_SECTION_ELSE + ld r7, STACK_SLOT_HFSCR(r1) + mtspr SPRN_HFSCR, r7 + ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) ++BEGIN_FTR_SECTION ++ ld r5, STACK_SLOT_FSCR(r1) ++ mtspr SPRN_FSCR, 
r5 ++END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + /* + * Restore various registers to 0, where non-zero values + * set by the guest could disrupt the host. +-- +2.31.1 + diff --git a/patches.suse/KVM-PPC-Book3S-HV-Tolerate-treclaim.-in-fake-suspend.patch b/patches.suse/KVM-PPC-Book3S-HV-Tolerate-treclaim.-in-fake-suspend.patch new file mode 100644 index 0000000..286488b --- /dev/null +++ b/patches.suse/KVM-PPC-Book3S-HV-Tolerate-treclaim.-in-fake-suspend.patch @@ -0,0 +1,102 @@ +From 267cdfa21385d78c794768233678756e32b39ead Mon Sep 17 00:00:00 2001 +From: Nicholas Piggin +Date: Wed, 8 Sep 2021 20:17:18 +1000 +Subject: [PATCH] KVM: PPC: Book3S HV: Tolerate treclaim. in fake-suspend mode + changing registers + +References: bsc#1156395 +Patch-mainline: v5.15-rc2 +Git-commit: 267cdfa21385d78c794768233678756e32b39ead + +POWER9 DD2.2 and 2.3 hardware implements a "fake-suspend" mode where +certain TM instructions executed in HV=0 mode cause softpatch interrupts +so the hypervisor can emulate them and prevent problematic processor +conditions. In this fake-suspend mode, the treclaim. instruction does +not modify registers. + +Unfortunately the rfscv instruction executed by the guest do not +generate softpatch interrupts, which can cause the hypervisor to lose +track of the fake-suspend mode, and it can execute this treclaim. while +not in fake-suspend mode. This modifies GPRs and crashes the hypervisor. + +It's not trivial to disable scv in the guest with HFSCR now, because +they assume a POWER9 has scv available. So this fix saves and restores +checkpointed registers across the treclaim. 
+ +Fixes: 7854f7545bff ("KVM: PPC: Book3S: Rework TM save/restore code and make it C-callable") +Signed-off-by: Nicholas Piggin +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20210908101718.118522-2-npiggin@gmail.com +Acked-by: Michal Suchanek +--- + arch/powerpc/kvm/book3s_hv_rmhandlers.S | 36 +++++++++++++++++++++++-- + 1 file changed, 34 insertions(+), 2 deletions(-) + +diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S +index 75079397c2a5..90484425a1e6 100644 +--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S ++++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S +@@ -2536,7 +2536,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_P9_TM_HV_ASSIST) + /* The following code handles the fake_suspend = 1 case */ + mflr r0 + std r0, PPC_LR_STKOFF(r1) +- stdu r1, -PPC_MIN_STKFRM(r1) ++ stdu r1, -TM_FRAME_SIZE(r1) + + /* Turn on TM. */ + mfmsr r8 +@@ -2551,10 +2551,42 @@ BEGIN_FTR_SECTION + END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) + nop + ++ /* ++ * It's possible that treclaim. may modify registers, if we have lost ++ * track of fake-suspend state in the guest due to it using rfscv. ++ * Save and restore registers in case this occurs. 
++ */ ++ mfspr r3, SPRN_DSCR ++ mfspr r4, SPRN_XER ++ mfspr r5, SPRN_AMR ++ /* SPRN_TAR would need to be saved here if the kernel ever used it */ ++ mfcr r12 ++ SAVE_NVGPRS(r1) ++ SAVE_GPR(2, r1) ++ SAVE_GPR(3, r1) ++ SAVE_GPR(4, r1) ++ SAVE_GPR(5, r1) ++ stw r12, 8(r1) ++ std r1, HSTATE_HOST_R1(r13) ++ + /* We have to treclaim here because that's the only way to do S->N */ + li r3, TM_CAUSE_KVM_RESCHED + TRECLAIM(R3) + ++ GET_PACA(r13) ++ ld r1, HSTATE_HOST_R1(r13) ++ REST_GPR(2, r1) ++ REST_GPR(3, r1) ++ REST_GPR(4, r1) ++ REST_GPR(5, r1) ++ lwz r12, 8(r1) ++ REST_NVGPRS(r1) ++ mtspr SPRN_DSCR, r3 ++ mtspr SPRN_XER, r4 ++ mtspr SPRN_AMR, r5 ++ mtcr r12 ++ HMT_MEDIUM ++ + /* + * We were in fake suspend, so we are not going to save the + * register state as the guest checkpointed state (since +@@ -2582,7 +2614,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) + std r5, VCPU_TFHAR(r9) + std r6, VCPU_TFIAR(r9) + +- addi r1, r1, PPC_MIN_STKFRM ++ addi r1, r1, TM_FRAME_SIZE + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 + blr +-- +2.31.1 + diff --git a/patches.suse/KVM-PPC-Fix-clearing-never-mapped-TCEs-in-realmode.patch b/patches.suse/KVM-PPC-Fix-clearing-never-mapped-TCEs-in-realmode.patch new file mode 100644 index 0000000..30caebc --- /dev/null +++ b/patches.suse/KVM-PPC-Fix-clearing-never-mapped-TCEs-in-realmode.patch @@ -0,0 +1,69 @@ +From 1d78dfde33a02da1d816279c2e3452978b7abd39 Mon Sep 17 00:00:00 2001 +From: Alexey Kardashevskiy +Date: Fri, 27 Aug 2021 14:07:06 +1000 +Subject: [PATCH] KVM: PPC: Fix clearing never mapped TCEs in realmode + +References: bsc#1156395 +Patch-mainline: v5.15-rc1 +Git-commit: 1d78dfde33a02da1d816279c2e3452978b7abd39 + +Since commit e1a1ef84cd07 ("KVM: PPC: Book3S: Allocate guest TCEs on +demand too"), pages for TCE tables for KVM guests are allocated only +when needed. This allows skipping any update when clearing TCEs. This +works mostly fine as TCE updates are handled when the MMU is enabled. 
+The realmode handlers fail with H_TOO_HARD when pages are not yet +allocated, except when clearing a TCE in which case KVM prints a warning +and proceeds to dereference a NULL pointer, which crashes the host OS. + +This has not been caught so far as the change in commit e1a1ef84cd07 is +reasonably new, and POWER9 runs mostly radix which does not use realmode +handlers. With hash, the default TCE table is memset() by QEMU when the +machine is reset which triggers page faults and the KVM TCE device's +kvm_spapr_tce_fault() handles those with MMU on. And the huge DMA +windows are not cleared by VMs which instead successfully create a DMA +window big enough to map the VM memory 1:1 and then VMs just map +everything without clearing. + +This started crashing now as commit 381ceda88c4c ("powerpc/pseries/iommu: +Make use of DDW for indirect mapping") added a mode when a dynamic DMA +window is not big enough to map the VM memory 1:1 but it is used anyway, +and the VM now is the first (i.e. not QEMU) to clear a just created +table. Note that upstream QEMU needs to be modified to trigger the VM to +trigger the host OS crash. + +This replaces WARN_ON_ONCE_RM() with a check and return, and adds +another warning if TCE is not being cleared. 
+ +Fixes: e1a1ef84cd07 ("KVM: PPC: Book3S: Allocate guest TCEs on demand too") +Signed-off-by: Alexey Kardashevskiy +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20210827040706.517652-1-aik@ozlabs.ru +Acked-by: Michal Suchanek +--- + arch/powerpc/kvm/book3s_64_vio_hv.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c +index dc6591548f0c..636c6ae0939b 100644 +--- a/arch/powerpc/kvm/book3s_64_vio_hv.c ++++ b/arch/powerpc/kvm/book3s_64_vio_hv.c +@@ -173,10 +173,13 @@ static void kvmppc_rm_tce_put(struct kvmppc_spapr_tce_table *stt, + idx -= stt->offset; + page = stt->pages[idx / TCES_PER_PAGE]; + /* +- * page must not be NULL in real mode, +- * kvmppc_rm_ioba_validate() must have taken care of this. ++ * kvmppc_rm_ioba_validate() allows pages not be allocated if TCE is ++ * being cleared, otherwise it returns H_TOO_HARD and we skip this. + */ +- WARN_ON_ONCE_RM(!page); ++ if (!page) { ++ WARN_ON_ONCE_RM(tce != 0); ++ return; ++ } + tbl = kvmppc_page_address(page); + + tbl[idx % TCES_PER_PAGE] = tce; +-- +2.31.1 + diff --git a/patches.suse/KVM-PPC-Fix-kvm_arch_vcpu_ioctl-vcpu_load-leak.patch b/patches.suse/KVM-PPC-Fix-kvm_arch_vcpu_ioctl-vcpu_load-leak.patch new file mode 100644 index 0000000..d02c446 --- /dev/null +++ b/patches.suse/KVM-PPC-Fix-kvm_arch_vcpu_ioctl-vcpu_load-leak.patch @@ -0,0 +1,51 @@ +From bc4188a2f56e821ea057aca6bf444e138d06c252 Mon Sep 17 00:00:00 2001 +From: Nicholas Piggin +Date: Fri, 16 Jul 2021 12:43:10 +1000 +Subject: [PATCH] KVM: PPC: Fix kvm_arch_vcpu_ioctl vcpu_load leak + +References: bsc#1156395 +Patch-mainline: v5.14-rc3 +Git-commit: bc4188a2f56e821ea057aca6bf444e138d06c252 + +vcpu_put is not called if the user copy fails. This can result in preempt +notifier corruption and crashes, among other issues. 
+ +Fixes: b3cebfe8c1ca ("KVM: PPC: Move vcpu_load/vcpu_put down to each ioctl case in kvm_arch_vcpu_ioctl") +Reported-by: Alexey Kardashevskiy +Signed-off-by: Nicholas Piggin +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20210716024310.164448-2-npiggin@gmail.com +Acked-by: Michal Suchanek +--- + arch/powerpc/kvm/powerpc.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c +index be33b5321a76..b4e6f70b97b9 100644 +--- a/arch/powerpc/kvm/powerpc.c ++++ b/arch/powerpc/kvm/powerpc.c +@@ -2048,9 +2048,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, + { + struct kvm_enable_cap cap; + r = -EFAULT; +- vcpu_load(vcpu); + if (copy_from_user(&cap, argp, sizeof(cap))) + goto out; ++ vcpu_load(vcpu); + r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); + vcpu_put(vcpu); + break; +@@ -2074,9 +2074,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, + case KVM_DIRTY_TLB: { + struct kvm_dirty_tlb dirty; + r = -EFAULT; +- vcpu_load(vcpu); + if (copy_from_user(&dirty, argp, sizeof(dirty))) + goto out; ++ vcpu_load(vcpu); + r = kvm_vcpu_ioctl_dirty_tlb(vcpu, &dirty); + vcpu_put(vcpu); + break; +-- +2.31.1 + diff --git a/patches.suse/netfilter-conntrack-collect-all-entries-in-one-cycle.patch b/patches.suse/netfilter-conntrack-collect-all-entries-in-one-cycle.patch new file mode 100644 index 0000000..b88b6cd --- /dev/null +++ b/patches.suse/netfilter-conntrack-collect-all-entries-in-one-cycle.patch @@ -0,0 +1,179 @@ +From: Florian Westphal +Date: Tue, 27 Jul 2021 00:29:19 +0200 +Subject: netfilter: conntrack: collect all entries in one cycle +Patch-mainline: v5.14-rc6 +Git-commit: 4608fdfc07e116f9fc0895beb40abad7cdb5ee3d +References: bsc#1173604 + +Michal Kubecek reports that conntrack gc is responsible for frequent +wakeups (every 125ms) on idle systems. + +On busy systems, timed out entries are evicted during lookup. 
+The gc worker is only needed to remove entries after system becomes idle +after a busy period. + +To resolve this, always scan the entire table. +If the scan is taking too long, reschedule so other work_structs can run +and resume from next bucket. + +After a completed scan, wait for 2 minutes before the next cycle. +Heuristics for faster re-schedule are removed. + +GC_SCAN_INTERVAL could be exposed as a sysctl in the future to allow +tuning this as-needed or even turn the gc worker off. + +Reported-by: Michal Kubecek +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Thomas Abraham +Acked-by: Michal Kubecek + +--- + net/netfilter/nf_conntrack_core.c | 71 ++++++++++--------------------- + 1 file changed, 22 insertions(+), 49 deletions(-) + +--- a/net/netfilter/nf_conntrack_core.c ++++ b/net/netfilter/nf_conntrack_core.c +@@ -66,22 +66,17 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash); + + struct conntrack_gc_work { + struct delayed_work dwork; +- u32 last_bucket; ++ u32 next_bucket; + bool exiting; + bool early_drop; +- long next_gc_run; + }; + + static __read_mostly struct kmem_cache *nf_conntrack_cachep; + static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); + static __read_mostly bool nf_conntrack_locks_all; + +-/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */ +-#define GC_MAX_BUCKETS_DIV 128u +-/* upper bound of full table scan */ +-#define GC_MAX_SCAN_JIFFIES (16u * HZ) +-/* desired ratio of entries found to be expired */ +-#define GC_EVICT_RATIO 50u ++#define GC_SCAN_INTERVAL (120u * HZ) ++#define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) + + static struct conntrack_gc_work conntrack_gc_work; + +@@ -1220,17 +1215,13 @@ static void nf_ct_offload_timeout(struct nf_conn *ct) + + static void gc_worker(struct work_struct *work) + { +- unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u); +- unsigned int i, goal, buckets = 0, expired_count = 0; +- unsigned int nf_conntrack_max95 = 0; ++ unsigned long end_time = 
jiffies + GC_SCAN_MAX_DURATION; ++ unsigned int i, hashsz, nf_conntrack_max95 = 0; ++ unsigned long next_run = GC_SCAN_INTERVAL; + struct conntrack_gc_work *gc_work; +- unsigned int ratio, scanned = 0; +- unsigned long next_run; +- + gc_work = container_of(work, struct conntrack_gc_work, dwork.work); + +- goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV; +- i = gc_work->last_bucket; ++ i = gc_work->next_bucket; + if (gc_work->early_drop) + nf_conntrack_max95 = nf_conntrack_max / 100u * 95u; + +@@ -1238,22 +1229,21 @@ static void gc_worker(struct work_struct *work) + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_head *ct_hash; + struct hlist_nulls_node *n; +- unsigned int hashsz; + struct nf_conn *tmp; + +- i++; + rcu_read_lock(); + + nf_conntrack_get_ht(&ct_hash, &hashsz); +- if (i >= hashsz) +- i = 0; ++ if (i >= hashsz) { ++ rcu_read_unlock(); ++ break; ++ } + + hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { + struct net *net; + + tmp = nf_ct_tuplehash_to_ctrack(h); + +- scanned++; + if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { + nf_ct_offload_timeout(tmp); + continue; +@@ -1261,7 +1251,6 @@ static void gc_worker(struct work_struct *work) + + if (nf_ct_is_expired(tmp)) { + nf_ct_gc_expired(tmp); +- expired_count++; + continue; + } + +@@ -1293,7 +1282,14 @@ static void gc_worker(struct work_struct *work) + */ + rcu_read_unlock(); + cond_resched(); +- } while (++buckets < goal); ++ i++; ++ ++ if (time_after(jiffies, end_time) && i < hashsz) { ++ gc_work->next_bucket = i; ++ next_run = 0; ++ break; ++ } ++ } while (i < hashsz); + + if (gc_work->exiting) + return; +@@ -1304,40 +1300,17 @@ static void gc_worker(struct work_struct *work) + * + * This worker is only here to reap expired entries when system went + * idle after a busy period. +- * +- * The heuristics below are supposed to balance conflicting goals: +- * +- * 1. Minimize time until we notice a stale entry +- * 2. 
Maximize scan intervals to not waste cycles +- * +- * Normally, expire ratio will be close to 0. +- * +- * As soon as a sizeable fraction of the entries have expired +- * increase scan frequency. + */ +- ratio = scanned ? expired_count * 100 / scanned : 0; +- if (ratio > GC_EVICT_RATIO) { +- gc_work->next_gc_run = min_interval; +- } else { +- unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV; +- +- BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0); +- +- gc_work->next_gc_run += min_interval; +- if (gc_work->next_gc_run > max) +- gc_work->next_gc_run = max; ++ if (next_run) { ++ gc_work->early_drop = false; ++ gc_work->next_bucket = 0; + } +- +- next_run = gc_work->next_gc_run; +- gc_work->last_bucket = i; +- gc_work->early_drop = false; + queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run); + } + + static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) + { + INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker); +- gc_work->next_gc_run = HZ; + gc_work->exiting = false; + } + diff --git a/patches.suse/powerpc-xive-Discard-disabled-interrupts-in-get_irqc.patch b/patches.suse/powerpc-xive-Discard-disabled-interrupts-in-get_irqc.patch new file mode 100644 index 0000000..d95027e --- /dev/null +++ b/patches.suse/powerpc-xive-Discard-disabled-interrupts-in-get_irqc.patch @@ -0,0 +1,54 @@ +From 6f779e1d359b8d5801f677c1d49dcfa10bf95674 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Mon, 11 Oct 2021 09:02:03 +0200 +Subject: [PATCH] powerpc/xive: Discard disabled interrupts in + get_irqchip_state() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +References: fate#322438 bsc#1085030 git-fixes +Patch-mainline: v5.15-rc6 +Git-commit: 6f779e1d359b8d5801f677c1d49dcfa10bf95674 + +When an interrupt is passed through, the KVM XIVE device calls the +set_vcpu_affinity() handler which raises the P bit to mask the +interrupt and to catch any in-flight 
interrupts while routing the +interrupt to the guest. + +On the guest side, drivers (like some Intels) can request at probe +time some MSIs and call synchronize_irq() to check that there are no +in flight interrupts. This will call the XIVE get_irqchip_state() +handler which will always return true as the interrupt P bit has been +set on the host side and lock the CPU in an infinite loop. + +Fix that by discarding disabled interrupts in get_irqchip_state(). + +Fixes: da15c03b047d ("powerpc/xive: Implement get_irqchip_state method for XIVE to fix shutdown race") +Cc: stable@vger.kernel.org #v5.4+ +Signed-off-by: Cédric Le Goater +Tested-by: seeteena +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20211011070203.99726-1-clg@kaod.org +Acked-by: Michal Suchanek +--- + arch/powerpc/sysdev/xive/common.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c +index c732ce5a3e1a..c5d75c02ad8b 100644 +--- a/arch/powerpc/sysdev/xive/common.c ++++ b/arch/powerpc/sysdev/xive/common.c +@@ -945,7 +945,8 @@ static int xive_get_irqchip_state(struct irq_data *data, + * interrupt to be inactive in that case. 
+ */ + *state = (pq != XIVE_ESB_INVALID) && !xd->stale_p && +- (xd->saved_p || !!(pq & XIVE_ESB_VAL_P)); ++ (xd->saved_p || (!!(pq & XIVE_ESB_VAL_P) && ++ !irqd_irq_disabled(data))); + return 0; + default: + return -EINVAL; +-- +2.31.1 + diff --git a/patches.suse/x86-pat-pass-valid-address-to-sanitize_phys.patch b/patches.suse/x86-pat-pass-valid-address-to-sanitize_phys.patch new file mode 100644 index 0000000..6e99067 --- /dev/null +++ b/patches.suse/x86-pat-pass-valid-address-to-sanitize_phys.patch @@ -0,0 +1,56 @@ +From: Jeff Moyer +Date: Wed, 11 Aug 2021 17:07:37 -0400 +Subject: x86/pat: Pass valid address to sanitize_phys() +Git-commit: aeef8b5089b76852bd84889f2809e69a7cfb414e +Patch-mainline: v5.15-rc2 +References: bsc#1152489 + +The end address passed to memtype_reserve() is handed directly to +sanitize_phys(). However, end is exclusive and sanitize_phys() expects +an inclusive address. If end falls at the end of the physical address +space, sanitize_phys() will return 0. This can result in drivers +failing to load, and the following warning: + + WARNING: CPU: 26 PID: 749 at arch/x86/mm/pat.c:354 reserve_memtype+0x262/0x450 + reserve_memtype failed: [mem 0x3ffffff00000-0xffffffffffffffff], req uncached-minus + Call Trace: + [] reserve_memtype+0x262/0x450 + [] ioremap_nocache+0x1a/0x20 + [] mpt3sas_base_map_resources+0x151/0xa60 [mpt3sas] + [] mpt3sas_base_attach+0xf5/0xa50 [mpt3sas] + ---[ end trace 6d6eea4438db89ef ]--- + ioremap reserve_memtype failed -22 + mpt3sas_cm0: unable to map adapter memory! or resource not found + mpt3sas_cm0: failure at drivers/scsi/mpt3sas/mpt3sas_scsih.c:10597/_scsih_probe()! + +Fix this by passing the inclusive end address to sanitize_phys(). 
+ +Fixes: 510ee090abc3 ("x86/mm/pat: Prepare {reserve, free}_memtype() for "decoy" addresses") +Signed-off-by: Jeff Moyer +Signed-off-by: Thomas Gleixner +Reviewed-by: David Hildenbrand +Reviewed-by: Dan Williams +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/x49o8a3pu5i.fsf@segfault.boston.devel.redhat.com + +Acked-by: Borislav Petkov +--- + arch/x86/mm/pat.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/arch/x86/mm/pat.c ++++ b/arch/x86/mm/pat.c +@@ -551,7 +551,12 @@ int reserve_memtype(u64 start, u64 end, + int err = 0; + + start = sanitize_phys(start); +- end = sanitize_phys(end); ++ ++ /* ++ * The end address passed into this function is exclusive, but ++ * sanitize_phys() expects an inclusive address. ++ */ ++ end = sanitize_phys(end - 1) + 1; + if (start >= end) { + WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__, + start, end - 1, cattr_name(req_type)); diff --git a/series.conf b/series.conf index f2b6db5..e2e75d5 100644 --- a/series.conf +++ b/series.conf @@ -50100,6 +50100,7 @@ patches.suse/x86-fault-don-t-send-sigsegv-twice-on-segv_pkuerr.patch patches.suse/x86-sev-check-sme-sev-support-in-cpuid-first patches.suse/powerpc-kprobes-Fix-validation-of-prefixed-instructi.patch + patches.suse/KVM-PPC-Book3S-HV-Save-host-FSCR-in-the-P7-8-path.patch patches.suse/bus-ti-sysc-Fix-flakey-idling-of-uarts-and-stop-usin.patch patches.suse/ext4-fix-bug-on-in-ext4_es_cache_extent-as-ext4_spli.patch patches.suse/ext4-fix-memory-leak-in-ext4_fill_super.patch @@ -51052,7 +51053,9 @@ patches.suse/efi-tpm-Differentiate-missing-and-invalid-final-even.patch patches.suse/firmware-efi-Tell-memblock-about-EFI-iomem-reservati.patch patches.suse/timers-Fix-get_next_timer_interrupt-with-no-timers-p.patch + patches.suse/KVM-PPC-Fix-kvm_arch_vcpu_ioctl-vcpu_load-leak.patch patches.suse/KVM-PPC-Book3S-Fix-H_RTAS-rets-buffer-overflow.patch + patches.suse/KVM-PPC-Book3S-HV-Nested-Sanitise-H_ENTER_NESTED-TM-.patch 
patches.suse/workqueue-fix-UAF-in-pwq_unbound_release_workfn.patch patches.suse/cgroup1-fix-leaked-context-root-causing-sporadic-NULL-deref-in-LTP.patch patches.suse/RDMA-bnxt_re-Fix-stats-counters.patch @@ -51183,6 +51186,7 @@ patches.suse/drm-meson-fix-colour-distortion-from-HDR-set-during-.patch patches.suse/ceph-reduce-contention-in-ceph_check_delayed_caps.patch patches.suse/ceph-take-snap_empty_lock-atomically-with-snaprealm-refcount-change.patch + patches.suse/netfilter-conntrack-collect-all-entries-in-one-cycle.patch patches.suse/bpf-Fix-integer-overflow-involving-bucket_size.patch patches.suse/net-ethernet-ti-cpsw-fix-min-eth-packet-size-for-non.patch patches.suse/bareudp-Fix-invalid-read-beyond-skb-s-linear-data.patch @@ -51598,10 +51602,12 @@ patches.suse/powerpc-perf-Drop-the-case-of-returning-0-as-instruc.patch patches.suse/powerpc-perf-Fix-the-check-for-SIAR-value.patch patches.suse/KVM-PPC-Book3S-HV-Fix-copy_tofrom_guest-routines.patch + patches.suse/KVM-PPC-Book3S-HV-Nested-Reflect-guest-PMU-in-use-to.patch patches.suse/powerpc-smp-Fix-a-crash-while-booting-kvm-guest-with.patch patches.suse/powerpc-smp-Update-cpu_core_map-on-all-PowerPc-syste.patch patches.suse/powerpc-smp-Enable-CACHE-domain-for-shared-processor.patch patches.suse/powerpc-numa-Update-cpu_cpu_map-on-CPU-online-offlin.patch + patches.suse/KVM-PPC-Fix-clearing-never-mapped-TCEs-in-realmode.patch patches.suse/NFSv4-pNFS-Fix-a-layoutget-livelock-loop.patch patches.suse/SUNRPC-Fix-potential-memory-corruption.patch patches.suse/SUNRPC-Simplify-socket-shutdown-when-not-reusing-TCP.patch @@ -51673,6 +51679,8 @@ patches.suse/nvme-avoid-race-in-shutdown-namespace-removal.patch patches.suse/PCI-Add-AMD-GPU-multi-function-power-dependencies.patch patches.suse/spi-Fix-tegra20-build-with-CONFIG_PM-n.patch + patches.suse/KVM-PPC-Book3S-HV-Tolerate-treclaim.-in-fake-suspend.patch + patches.suse/x86-pat-pass-valid-address-to-sanitize_phys.patch 
patches.suse/x86-mm-fix-kern_addr_valid-to-cope-with-existing-but-not-present-entries.patch patches.suse/drm-nouveau-nvkm-Replace-ENOSYS-with-ENODEV.patch patches.suse/enetc-Fix-uninitialized-struct-dim_sample-field-usag.patch @@ -51826,6 +51834,7 @@ patches.suse/iio-light-opt3001-Fixed-timeout-error-when-0-lux.patch patches.suse/cb710-avoid-NULL-pointer-subtraction.patch patches.suse/mei-me-add-Ice-Lake-N-device-id.patch + patches.suse/powerpc-xive-Discard-disabled-interrupts-in-get_irqc.patch patches.suse/KVM-PPC-Book3S-HV-Fix-stack-handling-in-idle_kvm_sta.patch patches.suse/KVM-PPC-Book3S-HV-Make-idle_kvm_start_guest-return-0.patch patches.suse/virtio-write-back-F_VERSION_1-before-validate.patch