diff --git a/patches.suse/workqueue-Print-backtraces-from-CPUs-with-hung-CPU-b.patch b/patches.suse/workqueue-Print-backtraces-from-CPUs-with-hung-CPU-b.patch
new file mode 100644
index 0000000..7d72e05
--- /dev/null
+++ b/patches.suse/workqueue-Print-backtraces-from-CPUs-with-hung-CPU-b.patch
@@ -0,0 +1,165 @@
+From cd2440d66fec7d1bdb4f605b64c27c63c9141989 Mon Sep 17 00:00:00 2001
+From: Petr Mladek <pmladek@suse.com>
+Date: Tue, 7 Mar 2023 13:53:35 +0100
+Subject: [PATCH] workqueue: Print backtraces from CPUs with hung CPU bound
+ workqueues
+Git-commit: cd2440d66fec7d1bdb4f605b64c27c63c9141989
+Patch-mainline: v6.4-rc1
+References: bsc#1211044
+
+The workqueue watchdog reports a lockup when there has not been any
+progress in the worker pool for a long time. Progress means that a
+pending work item starts being processed.
+
+Worker pools for unbound workqueues always wake up an idle worker and
+try to process the work immediately. The last idle worker has to create
+a new worker first. A stall can happen only when the new worker could
+not be created, in which case an error should get printed. Another
+possible problem is too high a load; in that case, the workers are
+victims of a global system problem.
+
+Worker pools for CPU-bound workqueues are designed for lightweight
+work items that do not need much CPU time. The items are processed one
+by one by a single worker. A new worker is used only when a work item
+sleeps. This creates one additional scenario: a stall might happen when
+a CPU-bound workqueue is used for CPU-intensive work.
+
+More precisely, the stall is detected when a CPU-bound worker has been
+in the TASK_RUNNING state for too long. In this case, it might be
+useful to see the backtrace from the problematic worker.
+
+The information about how long a worker has been in the running state
+is not available. But CPU-bound worker pools do not have many workers
+in the running state by definition. And typically only a few pools are
+blocked.
+
+It should be acceptable to print backtraces from all workers in the
+TASK_RUNNING state in the stalled worker pools. The number of false
+positives should be very low.
+
+Signed-off-by: Petr Mladek <pmladek@suse.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+
+---
+ kernel/workqueue.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 57 insertions(+)
+
+--- a/kernel/workqueue.c
++++ b/kernel/workqueue.c
+@@ -48,6 +48,7 @@
+ #include
+ #include
+ #include
++#include <linux/sched/debug.h>
+ #include
+ #include
+
+@@ -140,6 +141,8 @@ enum {
+  * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
+  *
+  * MD: wq_mayday_lock protected.
++ *
++ * WD: Used internally by the watchdog.
+  */
+
+ /* struct worker is defined in workqueue_internal.h */
+@@ -152,6 +155,7 @@ struct worker_pool {
+ 	unsigned int		flags;		/* X: flags */
+
+ 	unsigned long		watchdog_ts;	/* L: watchdog timestamp */
++	bool			cpu_stall;	/* WD: stalled cpu bound pool */
+
+ 	struct list_head	worklist;	/* L: list of pending works */
+ 	int			nr_workers;	/* L: total number of workers */
+@@ -5493,6 +5497,48 @@ static struct timer_list wq_watchdog_tim
+ static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
+ static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
+
++/*
++ * Show workers that might prevent the processing of pending work items.
++ * The only candidates are CPU-bound workers in the running state.
++ * Pending work items should be handled by another idle worker
++ * in all other situations.
++ */
++static void show_cpu_pool_hog(struct worker_pool *pool)
++{
++	struct worker *worker;
++	unsigned long flags;
++	int bkt;
++
++	spin_lock_irqsave(&pool->lock, flags);
++
++	hash_for_each(pool->busy_hash, bkt, worker, hentry) {
++		if (worker->task->state == TASK_RUNNING) {
++			pr_info("pool %d:\n", pool->id);
++			sched_show_task(worker->task);
++		}
++	}
++
++	spin_unlock_irqrestore(&pool->lock, flags);
++}
++
++static void show_cpu_pools_hogs(void)
++{
++	struct worker_pool *pool;
++	int pi;
++
++	pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");
++
++	rcu_read_lock();
++
++	for_each_pool(pool, pi) {
++		if (pool->cpu_stall)
++			show_cpu_pool_hog(pool);
++
++	}
++
++	rcu_read_unlock();
++}
++
+ static void wq_watchdog_reset_touched(void)
+ {
+ 	int cpu;
+@@ -5506,6 +5552,7 @@ static void wq_watchdog_timer_fn(unsigne
+ {
+ 	unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
+ 	bool lockup_detected = false;
++	bool cpu_pool_stall = false;
+ 	unsigned long now = jiffies;
+ 	struct worker_pool *pool;
+ 	int pi;
+@@ -5518,6 +5565,7 @@ static void wq_watchdog_timer_fn(unsigne
+ 	for_each_pool(pool, pi) {
+ 		unsigned long pool_ts, touched, ts;
+
++		pool->cpu_stall = false;
+ 		if (list_empty(&pool->worklist))
+ 			continue;
+
+@@ -5547,11 +5595,17 @@ static void wq_watchdog_timer_fn(unsigne
+ 		/* did we stall? */
+ 		if (time_after(now, ts + thresh)) {
+ 			lockup_detected = true;
++			if (pool->cpu >= 0) {
++				pool->cpu_stall = true;
++				cpu_pool_stall = true;
++			}
+ 			pr_emerg("BUG: workqueue lockup - pool");
+ 			pr_cont_pool_info(pool);
+ 			pr_cont(" stuck for %us!\n",
+ 				jiffies_to_msecs(now - pool_ts) / 1000);
+ 		}
++
++
+ 	}
+
+ 	rcu_read_unlock();
+@@ -5559,6 +5613,9 @@ static void wq_watchdog_timer_fn(unsigne
+ 	if (lockup_detected)
+ 		show_workqueue_state();
+
++	if (cpu_pool_stall)
++		show_cpu_pools_hogs();
++
+ 	wq_watchdog_reset_touched();
+ 	mod_timer(&wq_watchdog_timer, jiffies + thresh);
+ }
diff --git a/series.conf b/series.conf
index 946ca81..ca0d2fa 100644
--- a/series.conf
+++ b/series.conf
@@ -63309,6 +63309,7 @@
 	patches.suse/workqueue-Warn-when-a-new-worker-could-not-be-create.patch
 	patches.suse/workqueue-Interrupted-create_worker-is-not-a-repeate.patch
 	patches.suse/workqueue-Warn-when-a-rescuer-could-not-be-created.patch
+	patches.suse/workqueue-Print-backtraces-from-CPUs-with-hung-CPU-b.patch
 	patches.suse/xfs-verify-buffer-contents-when-we-skip-log-replay.patch

 	# dhowells/linux-fs keys-uefi
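
A note on exercising this path: the watchdog only examines pools whose
worklist is non-empty, so the new backtrace dump fires when one work item
hogs a CPU-bound worker in TASK_RUNNING while another item sits queued
behind it for longer than workqueue.watchdog_thresh (30 s by default,
CONFIG_WQ_WATCHDOG required). A minimal sketch of a test module that
should reproduce this is below; the module, its names, and the 90-second
busy-wait are illustrative assumptions, not part of the patch:

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical test module: stalls a CPU-bound worker pool so the
 * watchdog prints "BUG: workqueue lockup" and, with this patch,
 * the backtrace of the hogging worker. Not part of the patch. */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/delay.h>

static void hog_fn(struct work_struct *work)
{
	/* mdelay() busy-waits without sleeping, so the worker stays in
	 * TASK_RUNNING and the pool never starts victim_work below. */
	mdelay(90 * 1000);
}

static void victim_fn(struct work_struct *work)
{
	/* Runs only after hog_fn() finishes; until then it just keeps
	 * pool->worklist non-empty so the watchdog checks this pool. */
}

static DECLARE_WORK(hog_work, hog_fn);
static DECLARE_WORK(victim_work, victim_fn);

static int __init wq_hog_init(void)
{
	/* Queue both on CPU 0 so they share one CPU-bound worker pool. */
	queue_work_on(0, system_wq, &hog_work);
	queue_work_on(0, system_wq, &victim_work);
	return 0;
}

static void __exit wq_hog_exit(void)
{
	cancel_work_sync(&hog_work);
	cancel_work_sync(&victim_work);
}

module_init(wq_hog_init);
module_exit(wq_hog_exit);
MODULE_LICENSE("GPL");

With the default 30 s threshold, dmesg should show the existing
"BUG: workqueue lockup - pool ... stuck for 30s!" message followed by the
new "Showing backtraces of running workers in stalled CPU-bound worker
pools:" header and a stack trace of the worker running hog_fn(); on a
non-preemptible kernel the soft-lockup detector may fire as well.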