From f8bb5cae9616224a39cbb399de382d36ac41df10 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 1 Feb 2021 00:05:46 +0100
Subject: [PATCH] rcu/nocb: Trigger self-IPI on late deferred wake up before
 user resume
Git-commit: f8bb5cae9616224a39cbb399de382d36ac41df10
Patch-mainline: v5.12-rc1
References: git-fixes

Entering RCU idle mode may cause a deferred wakeup of an RCU NOCB_GP
kthread (rcuog) to be serviced.
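
As a reminder, the wakeup gets deferred in the first place because
call_rcu() can run with scheduler locks held, where waking rcuog
directly could deadlock (see the Fixes: commit below). The deferral
itself boils down to recording the intent (a sketch, not the literal
code of this tree):

	/* IRQs disabled, possibly under scheduler locks: don't wake
	 * rcuog now, just record the wakeup and let a later safe
	 * point (or the ->nocb_timer backstop) perform it. */
	WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE);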

Unfortunately, the call to rcu_user_enter() is already past the last
rescheduling opportunity before we resume to userspace or to guest
mode, so we may reach user/guest context with the woken task ignored.
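
Roughly, the problematic ordering on the resume path looks like this
(an illustrative shape, not any particular arch's entry code):

	while (need_resched())		/* last rescheduling opportunity */
		schedule();
	rcu_user_enter();		/* may wake rcuog and set
					   need_resched(): nobody will
					   look at it anymore */
	/* ...return to userspace/guest with the woken task ignored */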

The last resort that fixes every callsite is to trigger a self-IPI
(nohz_full already depends on the arch implementing
arch_irq_work_raise()) that will force a reschedule on the IRQ tail or
on guest exit.
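
The resulting sequence, sketched under the assumption that the
deferred wakeup does make need_resched() true:

	rcu_user_enter()
	  do_nocb_deferred_wakeup()	/* wakes rcuog */
	  irq_work_queue()		/* arch_irq_work_raise():
					   self-IPI now pending */
	IRQs re-enabled on resume	/* self-IPI fires */
	  IRQ tail sees need_resched() and schedules the woken task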

Eventually, every site that wants a saner treatment will need to
carefully place a call to rcu_nocb_flush_deferred_wakeup() before the
last explicit need_resched() check upon resume, as sketched below.
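
Such a callsite could look like the following sketch (the shape of
the loop is illustrative and not part of this patch):

	rcu_nocb_flush_deferred_wakeup();	/* flush before the final
						   check so a freshly
						   woken task is seen */
	while (need_resched())
		schedule();
	rcu_user_enter();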

Fixes: 96d3fd0d315a ("rcu: Break call_rcu() deadlock involving scheduler and perf")
Reported-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20210131230548.32970-4-frederic@kernel.org
Signed-off-by: Frederic Weisbecker <fweisbecker@suse.com>
---
 kernel/rcu/tree.c        |   22 +++++++++++++++++++++-
 kernel/rcu/tree.h        |    2 +-
 kernel/rcu/tree_plugin.h |   23 +++++++++++++++--------
 3 files changed, 37 insertions(+), 10 deletions(-)

--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -597,6 +597,19 @@ void rcu_idle_enter(void)
 }
 
 #ifdef CONFIG_NO_HZ_FULL
+
+/*
+ * An empty function that will trigger a reschedule on
+ * IRQ tail once IRQs get re-enabled on userspace resume.
+ */
+static void late_wakeup_func(struct irq_work *work)
+{
+}
+
+static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) = {
+	.func = late_wakeup_func,
+};
+
 /**
  * rcu_user_enter - inform RCU that we are resuming userspace.
  *
@@ -613,7 +626,14 @@ void rcu_user_enter(void)
 	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 
 	lockdep_assert_irqs_disabled();
-	do_nocb_deferred_wakeup(rdp);
+	/*
+	 * We may be past the last rescheduling opportunity in the entry code.
+	 * Trigger a self IPI that will fire and reschedule once we resume to
+	 * user/guest mode.
+	 */
+	if (do_nocb_deferred_wakeup(rdp) && need_resched())
+		irq_work_queue(this_cpu_ptr(&late_wakeup_work));
+
 	rcu_eqs_enter(true);
 }
 #endif /* CONFIG_NO_HZ_FULL */
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -429,7 +429,7 @@ static bool rcu_nocb_adopt_orphan_cbs(st
 				      struct rcu_data *rdp,
 				      unsigned long flags);
 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
 static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
 static void rcu_spawn_cpu_nocb_kthread(int cpu);
 static void __init rcu_spawn_nocb_kthreads(void);
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1543,7 +1543,7 @@ bool rcu_is_nocb_cpu(int cpu)
  * Kick the leader kthread for this NOCB group.  Caller holds ->nocb_lock
  * and this function releases it.
  */
-static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
+static bool __wake_nocb_leader(struct rcu_data *rdp, bool force,
 			       unsigned long flags)
 	__releases(rdp->nocb_lock)
 {
@@ -1552,7 +1552,7 @@ static void __wake_nocb_leader(struct rc
 	lockdep_assert_held(&rdp->nocb_lock);
 	if (!READ_ONCE(rdp_leader->nocb_kthread)) {
 		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
-		return;
+		return false;
 	}
 	if (rdp_leader->nocb_leader_sleep || force) {
 		/* Prior smp_mb__after_atomic() orders against prior enqueue. */
@@ -1561,8 +1561,10 @@ static void __wake_nocb_leader(struct rc
 		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
 		smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */
 		swake_up_one(&rdp_leader->nocb_wq);
+		return true;
 	} else {
 		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+		return false;
 	}
 }
 
@@ -1985,20 +1987,23 @@ static int rcu_nocb_need_deferred_wakeup
 }
 
 /* Do a deferred wakeup of rcu_nocb_kthread(). */
-static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
+static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
 {
 	unsigned long flags;
 	int ndw;
+	int ret;
 
 	raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
 	if (!rcu_nocb_need_deferred_wakeup(rdp)) {
 		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
-		return;
+		return false;
 	}
 	ndw = READ_ONCE(rdp->nocb_defer_wakeup);
 	WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
-	__wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+	ret = __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
 	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
+
+	return ret;
 }
 
 /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
@@ -2014,10 +2019,11 @@ static void do_nocb_deferred_wakeup_time
  * This means we do an inexact common-case check.  Note that if
  * we miss, ->nocb_timer will eventually clean things up.
  */
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
 {
 	if (rcu_nocb_need_deferred_wakeup(rdp))
-		do_nocb_deferred_wakeup_common(rdp);
+		return do_nocb_deferred_wakeup_common(rdp);
+	return false;
 }
 
 void rcu_nocb_flush_deferred_wakeup(void)
@@ -2276,8 +2282,9 @@ static int rcu_nocb_need_deferred_wakeup
 	return false;
 }
 
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
 {
+	return false;
 }
 
 static void rcu_spawn_cpu_nocb_kthread(int cpu)