Blob Blame History Raw
From cd2440d66fec7d1bdb4f605b64c27c63c9141989 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Tue, 7 Mar 2023 13:53:35 +0100
Subject: [PATCH] workqueue: Print backtraces from CPUs with hung CPU bound
 workqueues
Git-commit: cd2440d66fec7d1bdb4f605b64c27c63c9141989
Patch-mainline: v6.4-rc1
References: bsc#1211044

The workqueue watchdog reports a lockup when there was not any progress
in the worker pool for a long time. The progress means that a pending
work item starts being proceed.

Worker pools for unbound workqueues always wake up an idle worker and
try to process the work immediately. The last idle worker has to create
new worker first. The stall might happen only when a new worker could
not be created in which case an error should get printed. Another problem
might be too high load. In this case, workers are victims of a global
system problem.

Worker pools for CPU bound workqueues are designed for lightweight
work items that do not need much CPU time. They are proceed one by
one on a single worker. New worker is used only when a work is sleeping.
It creates one additional scenario. The stall might happen when
the CPU-bound workqueue is used for CPU-intensive work.

More precisely, the stall is detected when a CPU-bound worker is in
the TASK_RUNNING state for too long. In this case, it might be useful
to see the backtrace from the problematic worker.

The information how long a worker is in the running state is not available.
But the CPU-bound worker pools do not have many workers in the running
state by definition. And only few pools are typically blocked.

It should be acceptable to print backtraces from all workers in
TASK_RUNNING state in the stalled worker pools. The number of false
positives should be very low.

Signed-off-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

---
 kernel/workqueue.c |   57 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -48,6 +48,7 @@
 #include <linux/nodemask.h>
 #include <linux/moduleparam.h>
 #include <linux/uaccess.h>
+#include <linux/sched/debug.h>
 #include <linux/nmi.h>
 #include <linux/kvm_para.h>
 
@@ -140,6 +141,8 @@ enum {
  * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
  *
  * MD: wq_mayday_lock protected.
+ *
+ * WD: Used internally by the watchdog.
  */
 
 /* struct worker is defined in workqueue_internal.h */
@@ -152,6 +155,7 @@ struct worker_pool {
 	unsigned int		flags;		/* X: flags */
 
 	unsigned long		watchdog_ts;	/* L: watchdog timestamp */
+	bool			cpu_stall;	/* WD: stalled cpu bound pool */
 
 	struct list_head	worklist;	/* L: list of pending works */
 	int			nr_workers;	/* L: total number of workers */
@@ -5493,6 +5497,48 @@ static struct timer_list wq_watchdog_tim
 static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
 static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
 
+/*
+ * Show workers that might prevent the processing of pending work items.
+ * The only candidates are CPU-bound workers in the running state.
+ * Pending work items should be handled by another idle worker
+ * in all other situations.
+ */
+static void show_cpu_pool_hog(struct worker_pool *pool)
+{
+	struct worker *worker;
+	unsigned long flags;
+	int bkt;
+
+	spin_lock_irqsave(&pool->lock, flags);
+
+	hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+		if (worker->task->state == TASK_RUNNING) {
+			pr_info("pool %d:\n", pool->id);
+			sched_show_task(worker->task);
+		}
+	}
+
+	spin_unlock_irqrestore(&pool->lock, flags);
+}
+
+static void show_cpu_pools_hogs(void)
+{
+	struct worker_pool *pool;
+	int pi;
+
+	pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");
+
+	rcu_read_lock();
+
+	for_each_pool(pool, pi) {
+		if (pool->cpu_stall)
+			show_cpu_pool_hog(pool);
+
+	}
+
+	rcu_read_unlock();
+}
+
 static void wq_watchdog_reset_touched(void)
 {
 	int cpu;
@@ -5506,6 +5552,7 @@ static void wq_watchdog_timer_fn(unsigne
 {
 	unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
 	bool lockup_detected = false;
+	bool cpu_pool_stall = false;
 	unsigned long now = jiffies;
 	struct worker_pool *pool;
 	int pi;
@@ -5518,6 +5565,7 @@ static void wq_watchdog_timer_fn(unsigne
 	for_each_pool(pool, pi) {
 		unsigned long pool_ts, touched, ts;
 
+		pool->cpu_stall = false;
 		if (list_empty(&pool->worklist))
 			continue;
 
@@ -5547,11 +5595,17 @@ static void wq_watchdog_timer_fn(unsigne
 		/* did we stall? */
 		if (time_after(now, ts + thresh)) {
 			lockup_detected = true;
+			if (pool->cpu >= 0) {
+				pool->cpu_stall = true;
+				cpu_pool_stall = true;
+			}
 			pr_emerg("BUG: workqueue lockup - pool");
 			pr_cont_pool_info(pool);
 			pr_cont(" stuck for %us!\n",
 				jiffies_to_msecs(now - pool_ts) / 1000);
 		}
+
+
 	}
 
 	rcu_read_unlock();
@@ -5559,6 +5613,9 @@ static void wq_watchdog_timer_fn(unsigne
 	if (lockup_detected)
 		show_workqueue_state();
 
+	if (cpu_pool_stall)
+		show_cpu_pools_hogs();
+
 	wq_watchdog_reset_touched();
 	mod_timer(&wq_watchdog_timer, jiffies + thresh);
 }