From: Joerg Roedel <jroedel@suse.de>
Date: Fri, 18 Jun 2021 13:54:09 +0200
Subject: x86/sev: Split up runtime #VC handler for correct state tracking
Git-commit: be1a5408868af341f61f93c191b5e346ee88c82a
Patch-mainline: v5.14-rc1
References: jsc#SLE-14337
Split up the #VC handler code into a from-user and a from-kernel part.
This allows clean and correct state tracking, as the #VC handler needs
to enter NMI state when raised from kernel mode and plain IRQ state when
raised from user mode.
Fixes: 62441a1fb532 ("x86/sev-es: Correctly track IRQ states in runtime #VC handler")
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210618115409.22735-3-joro@8bytes.org
---
arch/x86/entry/entry_64.S | 4 -
arch/x86/include/asm/traps.h | 2
arch/x86/kernel/sev-es.c | 116 ++++++++++++++++++++++---------------------
3 files changed, 66 insertions(+), 56 deletions(-)
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -557,7 +557,7 @@ ENTRY(\asmsym)
movq %rsp, %rdi /* pt_regs pointer */
- call \cfunc
+ call kernel_\cfunc
/*
* No need to switch back to the IST stack. The current stack is either
@@ -576,7 +576,7 @@ ENTRY(\asmsym)
movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
- call safe_stack_\cfunc
+ call user_\cfunc
jmp error_exit
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -86,6 +86,8 @@ asmlinkage __visible notrace
struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s);
void __init trap_init(void);
asmlinkage __visible struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs);
+asmlinkage void kernel_vmm_communication(struct pt_regs *regs, unsigned long error_code);
+asmlinkage void user_vmm_communication(struct pt_regs *regs, unsigned long error_code);
#endif
dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code);
dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev-es.c
@@ -761,7 +761,7 @@ void __init sev_es_init_vc_handling(void
sev_es_setup_play_dead();
/* Secondary CPUs use the runtime #VC handler */
- initial_vc_handler = (unsigned long)safe_stack_vmm_communication;
+ initial_vc_handler = (unsigned long)kernel_vmm_communication;
}
static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
@@ -1199,11 +1199,6 @@ static enum es_result vc_handle_trap_ac(
return ES_EXCEPTION;
}
-static __always_inline void vc_handle_trap_db(struct pt_regs *regs)
-{
- do_debug(regs, 0);
-}
-
static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
struct ghcb *ghcb,
unsigned long exit_code)
@@ -1299,31 +1294,13 @@ static __always_inline bool on_vc_fallba
return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
}
-/*
- * Main #VC exception handler. It is called when the entry code was able to
- * switch off the IST to a safe kernel stack.
- *
- * With the current implementation it is always possible to switch to a safe
- * stack because #VC exceptions only happen at known places, like intercepted
- * instructions or accesses to MMIO areas/IO ports. They can also happen with
- * code instrumentation when the hypervisor intercepts #DB, but the critical
- * paths are forbidden to be instrumented, so #DB exceptions currently also
- * only happen in safe places.
- */
-void safe_stack_vmm_communication(struct pt_regs *regs, unsigned long error_code)
+static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code)
{
struct ghcb_state state;
struct es_em_ctxt ctxt;
enum es_result result;
struct ghcb *ghcb;
-
- /*
- * Handle #DB before calling into !noinstr code to avoid recursive #DB.
- */
- if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) {
- vc_handle_trap_db(regs);
- return;
- }
+ bool ret = true;
/*
* This is invoked through an interrupt gate, so IRQs are disabled. The
@@ -1349,15 +1326,18 @@ void safe_stack_vmm_communication(struct
case ES_UNSUPPORTED:
pr_err_ratelimited("Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
error_code, regs->ip);
- goto fail;
+ ret = false;
+ break;
case ES_VMM_ERROR:
pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
error_code, regs->ip);
- goto fail;
+ ret = false;
+ break;
case ES_DECODE_FAILED:
pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
error_code, regs->ip);
- goto fail;
+ ret = false;
+ break;
case ES_EXCEPTION:
vc_forward_exception(&ctxt);
break;
@@ -1373,21 +1353,45 @@ void safe_stack_vmm_communication(struct
BUG();
}
-out:
- return;
+ return ret;
+}
-fail:
- if (user_mode(regs)) {
- /*
- * Do not kill the machine if user-space triggered the
- * exception. Send SIGBUS instead and let user-space deal with
- * it.
- */
- force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
- } else {
- pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n",
- result);
+static __always_inline bool vc_is_db(unsigned long error_code)
+{
+ return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB;
+}
+
+/*
+ * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode
+ * and will panic when an error happens.
+ */
+void kernel_vmm_communication(struct pt_regs *regs, unsigned long error_code)
+{
+ /*
+ * With the current implementation it is always possible to switch to a
+ * safe stack because #VC exceptions only happen at known places, like
+ * intercepted instructions or accesses to MMIO areas/IO ports. They can
+ * also happen with code instrumentation when the hypervisor intercepts
+ * #DB, but the critical paths are forbidden to be instrumented, so #DB
+ * exceptions currently also only happen in safe places.
+ *
+ * But keep this here in case the noinstr annotations are violated due
+ * to a bug elsewhere.
+ */
+ if (unlikely(on_vc_fallback_stack(regs))) {
+ panic("Can't handle #VC exception from unsupported context\n");
+ }
+
+ /*
+ * Handle #DB before calling into !noinstr code to avoid recursive #DB.
+ */
+ if (vc_is_db(error_code)) {
+ do_debug(regs, 0);
+ return;
+ }
+
+ if (!vc_raw_handle_exception(regs, error_code)) {
/* Show some debug info */
show_regs(regs);
@@ -1397,22 +1401,26 @@ fail:
/* If that fails and we get here - just panic */
panic("Returned from Terminate-Request to Hypervisor\n");
}
-
- goto out;
}
-/* This handler runs on the #VC fall-back stack. It can cause further #VC exceptions */
-void ist_vmm_communication(struct pt_regs *regs, unsigned long error_code)
+void user_vmm_communication(struct pt_regs *regs, unsigned long error_code)
{
- panic("Can't handle #VC exception from unsupported context\n");
-}
+ /*
+ * Handle #DB before calling into !noinstr code to avoid recursive #DB.
+ */
+ if (vc_is_db(error_code)) {
+ do_debug(regs, 0);
+ return;
+ }
-void vmm_communication(struct pt_regs *regs, unsigned long error_code)
-{
- if (likely(!on_vc_fallback_stack(regs)))
- safe_stack_vmm_communication(regs, error_code);
- else
- ist_vmm_communication(regs, error_code);
+ if (!vc_raw_handle_exception(regs, error_code)) {
+ /*
+ * Do not kill the machine if user-space triggered the
+ * exception. Send SIGBUS instead and let user-space deal with
+ * it.
+ */
+ force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
+ }
}
bool __init handle_vc_boot_ghcb(struct pt_regs *regs)