From: Valentin Schneider <vschneid@redhat.com>
Date: Thu, 30 Jun 2022 23:32:58 +0100
Subject: panic, kexec: make __crash_kexec() NMI safe
Git-commit: 05c6257433b7212f07a7e53479a8ab038fc1666a
Patch-mainline: v6.1-rc1
References: git-fixes
Attempting to get a crash dump out of a debug PREEMPT_RT kernel via an NMI
panic() doesn't work. The cause of that lies in the PREEMPT_RT definition
of mutex_trylock():
if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
return 0;
This prevents an nmi_panic() from executing the main body of
__crash_kexec() which does the actual kexec into the kdump kernel. The
warning and return are explained by:
6ce47fd961fa ("rtmutex: Warn if trylock is called from hard/softirq context")
[...]
The reasons for this are:
1) There is a potential deadlock in the slowpath
2) Another cpu which blocks on the rtmutex will boost the task
which allegedly locked the rtmutex, but that cannot work
because the hard/softirq context borrows the task context.
Furthermore, grabbing the lock isn't NMI safe, so do away with kexec_mutex
and replace it with an atomic variable. This is somewhat overzealous as
*some* callsites could keep using a mutex (e.g. the sysfs-facing ones
like crash_shrink_memory()), but this has the benefit of involving a
single unified lock and preventing any future NMI-related surprises.
Tested by triggering NMI panics via:
$ echo 1 > /proc/sys/kernel/panic_on_unrecovered_nmi
$ echo 1 > /proc/sys/kernel/unknown_nmi_panic
$ echo 1 > /proc/sys/kernel/panic
$ ipmitool power diag
Link: https://lkml.kernel.org/r/20220630223258.4144112-3-vschneid@redhat.com
Fixes: 6ce47fd961fa ("rtmutex: Warn if trylock is called from hard/softirq context")
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: "Eric W . Biederman" <ebiederm@xmission.com>
Cc: Juri Lelli <jlelli@redhat.com>
Cc: Luis Claudio R. Goncalves <lgoncalv@redhat.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[ ptesarik: One hunk repeated for compat because of missing
upstream commit 4b692e861619353ce069e547a67c8d0e32d9ef3d ]
Signed-off-by: Petr Tesarik <ptesarik@suse.com>
---
kernel/kexec.c | 28 ++++++++++++----------------
kernel/kexec_core.c | 20 ++++++++++----------
kernel/kexec_file.c | 4 ++--
kernel/kexec_internal.h | 15 ++++++++++++++-
4 files changed, 38 insertions(+), 29 deletions(-)
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -248,19 +248,17 @@ SYSCALL_DEFINE4(kexec_load, unsigned lon
return -EINVAL;
/* Because we write directly to the reserved memory
- * region when loading crash kernels we need a mutex here to
- * prevent multiple crash kernels from attempting to load
- * simultaneously, and to prevent a crash kernel from loading
- * over the top of a in use crash kernel.
- *
- * KISS: always take the mutex.
+ * region when loading crash kernels we need a serialization
+ * here to prevent multiple crash kernels from attempting to
+ * load simultaneously, and to prevent a crash kernel from
+ * loading over the top of a in use crash kernel.
*/
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
result = do_kexec_load(entry, nr_segments, segments, flags);
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return result;
}
@@ -302,19 +300,17 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compa
}
/* Because we write directly to the reserved memory
- * region when loading crash kernels we need a mutex here to
- * prevent multiple crash kernels from attempting to load
- * simultaneously, and to prevent a crash kernel from loading
- * over the top of a in use crash kernel.
- *
- * KISS: always take the mutex.
+ * region when loading crash kernels we need a serialization
+ * here to prevent multiple crash kernels from attempting to
+ * load simultaneously, and to prevent a crash kernel from
+ * loading over the top of a in use crash kernel.
*/
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
result = do_kexec_load(entry, nr_segments, ksegments, flags);
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return result;
}
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -46,7 +46,7 @@
#include <crypto/hash.h>
#include "kexec_internal.h"
-DEFINE_MUTEX(kexec_mutex);
+atomic_t __kexec_lock = ATOMIC_INIT(0);
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
@@ -939,7 +939,7 @@ int kexec_load_disabled;
*/
void __noclone __crash_kexec(struct pt_regs *regs)
{
- /* Take the kexec_mutex here to prevent sys_kexec_load
+ /* Take the kexec_lock here to prevent sys_kexec_load
* running on one cpu from replacing the crash kernel
* we are using after a panic on a different cpu.
*
@@ -947,7 +947,7 @@ void __noclone __crash_kexec(struct pt_r
* of memory the xchg(&kexec_crash_image) would be
* sufficient. But since I reuse the memory...
*/
- if (mutex_trylock(&kexec_mutex)) {
+ if (kexec_trylock()) {
if (kexec_crash_image) {
struct pt_regs fixed_regs;
@@ -956,7 +956,7 @@ void __noclone __crash_kexec(struct pt_r
machine_crash_shutdown(&fixed_regs);
machine_kexec(kexec_crash_image);
}
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
}
}
STACK_FRAME_NON_STANDARD(__crash_kexec);
@@ -988,13 +988,13 @@ ssize_t crash_get_memory_size(void)
{
ssize_t size = 0;
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
if (crashk_res.end != crashk_res.start)
size = resource_size(&crashk_res);
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return size;
}
@@ -1005,7 +1005,7 @@ int crash_shrink_memory(unsigned long ne
unsigned long old_size;
struct resource *ram_res;
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
if (kexec_crash_image) {
@@ -1044,7 +1044,7 @@ int crash_shrink_memory(unsigned long ne
insert_resource(&iomem_resource, ram_res);
unlock:
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return ret;
}
@@ -1116,7 +1116,7 @@ int kernel_kexec(void)
{
int error = 0;
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
if (!kexec_image) {
error = -EINVAL;
@@ -1192,6 +1192,6 @@ int kernel_kexec(void)
#endif
Unlock:
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
return error;
}
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -335,7 +335,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, ke
image = NULL;
- if (!mutex_trylock(&kexec_mutex))
+ if (!kexec_trylock())
return -EBUSY;
dest_image = &kexec_image;
@@ -407,7 +407,7 @@ out:
if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
arch_kexec_protect_crashkres();
- mutex_unlock(&kexec_mutex);
+ kexec_unlock();
kimage_free(image);
return ret;
}
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -13,7 +13,20 @@ void kimage_terminate(struct kimage *ima
int kimage_is_destination_range(struct kimage *image,
unsigned long start, unsigned long end);
-extern struct mutex kexec_mutex;
+/*
+ * Whatever is used to serialize accesses to the kexec_crash_image needs to be
+ * NMI safe, as __crash_kexec() can happen during nmi_panic(), so here we use a
+ * "simple" atomic variable that is acquired with a cmpxchg().
+ */
+extern atomic_t __kexec_lock;
+static inline bool kexec_trylock(void)
+{
+ return atomic_cmpxchg_acquire(&__kexec_lock, 0, 1) == 0;
+}
+static inline void kexec_unlock(void)
+{
+ atomic_set_release(&__kexec_lock, 0);
+}
#ifdef CONFIG_KEXEC_FILE
#include <linux/purgatory.h>