From: Jan Kara <jack@suse.cz>
Subject: blk-wbt: Fix missed wakeup
References: bsc#1186627
Patch-mainline: Never, upstream has reworked the code

When multiple __wbt_wait() calls race with the end of IO handled by
wbt_rqw_done(), a missed wakeup (and consequently an infinite hang in
__wbt_wait()) can happen as follows:

CPU1 (waiter1)		CPU2 (waiter2)		CPU3 (waker)
__wbt_wait()		__wbt_wait()
  blocks in io_schedule()
			  has_sleeper = wq_has_sleeper();
			    -> true as waiter1 is already queued
					        __wbt_done()
						  wbt_rqw_done()
						    wakes up waiter1
  ... wakes up
  if (data.got_token)
    break; (got_token set by wakeup function)
blk_mq_get_request()
  - returns NULL
if (unlikely(!rq)) {
  __wbt_done();
  ...
			  prepare_to_wait_exclusive(...);
			    goes to sleep as has_sleeper is true

Now there's nobody left to wake up waiter2, and every further process entering
__wbt_wait() will queue up behind waiter2 and go to sleep as well.
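
For reference, the pre-patch waiting logic in __wbt_wait() follows roughly
this pattern (a simplified sketch reconstructed from the diagram above and
the diff context below, not a verbatim copy; the point is that the
wq_has_sleeper() check and the later prepare_to_wait_exclusive() call are
not atomic with respect to wakeups):

	has_sleeper = wq_has_sleeper(&rqw->wait);
	if (!has_sleeper &&
	    atomic_inc_below(&rqw->inflight, get_limit(rwb, rw)))
		return;

	prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
	do {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (data.got_token)
			break;
		io_schedule();
	} while (1);
	finish_wait(&rqw->wait, &data.wq);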

Fix the problem by updating has_sleeper based on the actual state of the wait
queue at the moment we add ourselves to it. That way forward progress is
guaranteed for at least one process, as sketched below.
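
The intended usage then looks like this (a minimal sketch; the retry logic
inside the loop is an assumption modeled on the pre-existing __wbt_wait()
code and the hunk below, it is not itself part of this diff):

	/*
	 * prepare_to_wait_exclusive_first() returns true iff the wait queue
	 * was empty when we queued ourselves, i.e. we are the head waiter
	 * and nobody is obliged to wake us.
	 */
	has_sleeper = !prepare_to_wait_exclusive_first(&rqw->wait, &data.wq,
						       TASK_UNINTERRUPTIBLE);
	do {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (data.got_token)
			break;
		/*
		 * The head waiter retries the inflight check itself instead
		 * of relying on a wakeup that may already have been consumed
		 * and lost, which guarantees forward progress. (The real
		 * code must additionally handle racing with a concurrent
		 * token grant here.)
		 */
		if (!has_sleeper &&
		    atomic_inc_below(&rqw->inflight, get_limit(rwb, rw))) {
			finish_wait(&rqw->wait, &data.wq);
			return;
		}
		io_schedule();
		has_sleeper = false;
	} while (1);
	finish_wait(&rqw->wait, &data.wq);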

Signed-off-by: Jan Kara <jack@suse.cz>

---
 block/blk-wbt.c      |    3 ++-
 include/linux/wait.h |    1 +
 kernel/sched/wait.c  |   14 ++++++++++++--
 3 files changed, 15 insertions(+), 3 deletions(-)

--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -603,7 +603,8 @@ static void __wbt_wait(struct rq_wb *rwb
 			atomic_inc_below(&rqw->inflight, get_limit(rwb, rw)))
 		return;
 
-	prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
+	has_sleeper = !prepare_to_wait_exclusive_first(&rqw->wait, &data.wq,
+						       TASK_UNINTERRUPTIBLE);
 	do {
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		if (data.got_token)
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -982,6 +982,7 @@ do {										\
  */
 void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 void prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
+bool prepare_to_wait_exclusive_first(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -239,17 +239,27 @@ prepare_to_wait(struct wait_queue_head *
 }
 EXPORT_SYMBOL(prepare_to_wait);
 
-void
-prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
+bool
+prepare_to_wait_exclusive_first(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
 {
 	unsigned long flags;
+	bool ret;
 
 	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
 	spin_lock_irqsave(&wq_head->lock, flags);
+	ret = list_empty(&wq_head->head);
 	if (list_empty(&wq_entry->entry))
 		__add_wait_queue_entry_tail(wq_head, wq_entry);
 	set_current_state(state);
 	spin_unlock_irqrestore(&wq_head->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(prepare_to_wait_exclusive_first);
+
+void
+prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
+{
+	prepare_to_wait_exclusive_first(wq_head, wq_entry, state);
 }
 EXPORT_SYMBOL(prepare_to_wait_exclusive);