|
Davidlohr Bueso |
17bb75 |
From a9e9bcb45b1525ba7aea26ed9441e8632aeeda58 Mon Sep 17 00:00:00 2001
|
|
Davidlohr Bueso |
17bb75 |
From: Waiman Long <longman@redhat.com>
|
|
Davidlohr Bueso |
17bb75 |
Date: Sun, 28 Apr 2019 17:25:38 -0400
|
|
Davidlohr Bueso |
17bb75 |
Subject: [PATCH] locking/rwsem: Prevent decrement of reader count before increment
|
|
Davidlohr Bueso |
17bb75 |
Git-commit: a9e9bcb45b1525ba7aea26ed9441e8632aeeda58
|
|
Davidlohr Bueso |
17bb75 |
Patch-mainline: v5.2-rc1
|
|
Davidlohr Bueso |
17bb75 |
References: bsc#1050549
|
|
Davidlohr Bueso |
17bb75 |
|
|
Davidlohr Bueso |
17bb75 |
During my rwsem testing, it was found that after a down_read(), the
|
|
Davidlohr Bueso |
17bb75 |
reader count may occasionally become 0 or even negative. Consequently,
|
|
Davidlohr Bueso |
17bb75 |
a writer may steal the lock at that time and execute with the reader
|
|
Davidlohr Bueso |
17bb75 |
in parallel thus breaking the mutual exclusion guarantee of the write
|
|
Davidlohr Bueso |
17bb75 |
lock. In other words, both readers and writer can become rwsem owners
|
|
Davidlohr Bueso |
17bb75 |
simultaneously.
|
|
Davidlohr Bueso |
17bb75 |
|
|
Davidlohr Bueso |
17bb75 |
The current reader wakeup code does it in one pass to clear waiter->task
|
|
Davidlohr Bueso |
17bb75 |
and put them into wake_q before fully incrementing the reader count.
|
|
Davidlohr Bueso |
17bb75 |
Once waiter->task is cleared, the corresponding reader may see it,
|
|
Davidlohr Bueso |
17bb75 |
finish the critical section and do unlock to decrement the count before
|
|
Davidlohr Bueso |
17bb75 |
the count is incremented. This is not a problem if there is only one
|
|
Davidlohr Bueso |
17bb75 |
reader to wake up as the count has been pre-incremented by 1. It is
|
|
Davidlohr Bueso |
17bb75 |
a problem if there are more than one readers to be woken up and writer
|
|
Davidlohr Bueso |
17bb75 |
can steal the lock.
|
|
Davidlohr Bueso |
17bb75 |
|
|
Davidlohr Bueso |
17bb75 |
The wakeup was actually done in 2 passes before the following v4.9 commit:
|
|
Davidlohr Bueso |
17bb75 |
|
|
Davidlohr Bueso |
17bb75 |
70800c3c0cc5 ("locking/rwsem: Scan the wait_list for readers only once")
|
|
Davidlohr Bueso |
17bb75 |
|
|
Davidlohr Bueso |
17bb75 |
To fix this problem, the wakeup is now done in two passes
|
|
Davidlohr Bueso |
17bb75 |
again. In the first pass, we collect the readers and count them.
|
|
Davidlohr Bueso |
17bb75 |
The reader count is then fully incremented. In the second pass, the
|
|
Davidlohr Bueso |
17bb75 |
waiter->task is then cleared and they are put into wake_q to be woken
|
|
Davidlohr Bueso |
17bb75 |
up later.
|
|
Davidlohr Bueso |
17bb75 |
|
|
Davidlohr Bueso |
17bb75 |
Signed-off-by: Waiman Long <longman@redhat.com>
|
|
Davidlohr Bueso |
17bb75 |
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
|
|
Davidlohr Bueso |
17bb75 |
Cc: Borislav Petkov <bp@alien8.de>
|
|
Davidlohr Bueso |
17bb75 |
Cc: Davidlohr Bueso <dave@stgolabs.net>
|
|
Davidlohr Bueso |
17bb75 |
Cc: Peter Zijlstra <peterz@infradead.org>
|
|
Davidlohr Bueso |
17bb75 |
Cc: Thomas Gleixner <tglx@linutronix.de>
|
|
Davidlohr Bueso |
17bb75 |
Cc: Tim Chen <tim.c.chen@linux.intel.com>
|
|
Davidlohr Bueso |
17bb75 |
Cc: Will Deacon <will.deacon@arm.com>
|
|
Davidlohr Bueso |
17bb75 |
Cc: huang ying <huang.ying.caritas@gmail.com>
|
|
Davidlohr Bueso |
17bb75 |
Fixes: 70800c3c0cc5 ("locking/rwsem: Scan the wait_list for readers only once")
|
|
Davidlohr Bueso |
17bb75 |
Link: http://lkml.kernel.org/r/20190428212557.13482-2-longman@redhat.com
|
|
Davidlohr Bueso |
17bb75 |
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
|
Davidlohr Bueso |
17bb75 |
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
|
|
Davidlohr Bueso |
17bb75 |
|
|
Davidlohr Bueso |
17bb75 |
---
|
|
Davidlohr Bueso |
17bb75 |
kernel/locking/rwsem-xadd.c | 44 ++++++++++++++++++++++++++++++--------------
|
|
Davidlohr Bueso |
17bb75 |
1 file changed, 30 insertions(+), 14 deletions(-)
|
|
Davidlohr Bueso |
17bb75 |
|
|
Davidlohr Bueso |
17bb75 |
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
|
|
Davidlohr Bueso |
17bb75 |
index 34c70a985ca9..7d29e15d8f44 100644
|
|
Davidlohr Bueso |
17bb75 |
--- a/kernel/locking/rwsem-xadd.c
|
|
Davidlohr Bueso |
17bb75 |
+++ b/kernel/locking/rwsem-xadd.c
|
|
Davidlohr Bueso |
17bb75 |
@@ -129,6 +129,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
|
|
Davidlohr Bueso |
17bb75 |
{
|
|
Davidlohr Bueso |
17bb75 |
struct rwsem_waiter *waiter, *tmp;
|
|
Davidlohr Bueso |
17bb75 |
long oldcount, woken = 0, adjustment = 0;
|
|
Davidlohr Bueso |
17bb75 |
+ struct list_head wlist;
|
|
Davidlohr Bueso |
17bb75 |
|
|
Davidlohr Bueso |
17bb75 |
/*
|
|
Davidlohr Bueso |
17bb75 |
* Take a peek at the queue head waiter such that we can determine
|
|
Davidlohr Bueso |
17bb75 |
@@ -187,18 +188,42 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
|
|
Davidlohr Bueso |
17bb75 |
* of the queue. We know that woken will be at least 1 as we accounted
|
|
Davidlohr Bueso |
17bb75 |
* for above. Note we increment the 'active part' of the count by the
|
|
Davidlohr Bueso |
17bb75 |
* number of readers before waking any processes up.
|
|
Davidlohr Bueso |
17bb75 |
+ *
|
|
Davidlohr Bueso |
17bb75 |
+ * We have to do wakeup in 2 passes to prevent the possibility that
|
|
Davidlohr Bueso |
17bb75 |
+ * the reader count may be decremented before it is incremented. It
|
|
Davidlohr Bueso |
17bb75 |
+ * is because the to-be-woken waiter may not have slept yet. So it
|
|
Davidlohr Bueso |
17bb75 |
+ * may see waiter->task got cleared, finish its critical section and
|
|
Davidlohr Bueso |
17bb75 |
+ * do an unlock before the reader count increment.
|
|
Davidlohr Bueso |
17bb75 |
+ *
|
|
Davidlohr Bueso |
17bb75 |
+ * 1) Collect the read-waiters in a separate list, count them and
|
|
Davidlohr Bueso |
17bb75 |
+ * fully increment the reader count in rwsem.
|
|
Davidlohr Bueso |
17bb75 |
+ * 2) For each waiters in the new list, clear waiter->task and
|
|
Davidlohr Bueso |
17bb75 |
+ * put them into wake_q to be woken up later.
|
|
Davidlohr Bueso |
17bb75 |
*/
|
|
Davidlohr Bueso |
17bb75 |
- list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
|
|
Davidlohr Bueso |
17bb75 |
- struct task_struct *tsk;
|
|
Davidlohr Bueso |
17bb75 |
-
|
|
Davidlohr Bueso |
17bb75 |
+ list_for_each_entry(waiter, &sem->wait_list, list) {
|
|
Davidlohr Bueso |
17bb75 |
if (waiter->type == RWSEM_WAITING_FOR_WRITE)
|
|
Davidlohr Bueso |
17bb75 |
break;
|
|
Davidlohr Bueso |
17bb75 |
|
|
Davidlohr Bueso |
17bb75 |
woken++;
|
|
Davidlohr Bueso |
17bb75 |
- tsk = waiter->task;
|
|
Davidlohr Bueso |
17bb75 |
+ }
|
|
Davidlohr Bueso |
17bb75 |
+ list_cut_before(&wlist, &sem->wait_list, &waiter->list);
|
|
Davidlohr Bueso |
17bb75 |
+
|
|
Davidlohr Bueso |
17bb75 |
+ adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
|
|
Davidlohr Bueso |
17bb75 |
+ if (list_empty(&sem->wait_list)) {
|
|
Davidlohr Bueso |
17bb75 |
+ /* hit end of list above */
|
|
Davidlohr Bueso |
17bb75 |
+ adjustment -= RWSEM_WAITING_BIAS;
|
|
Davidlohr Bueso |
17bb75 |
+ }
|
|
Davidlohr Bueso |
17bb75 |
+
|
|
Davidlohr Bueso |
17bb75 |
+ if (adjustment)
|
|
Davidlohr Bueso |
17bb75 |
+ atomic_long_add(adjustment, &sem->count);
|
|
Davidlohr Bueso |
17bb75 |
+
|
|
Davidlohr Bueso |
17bb75 |
+ /* 2nd pass */
|
|
Davidlohr Bueso |
17bb75 |
+ list_for_each_entry_safe(waiter, tmp, &wlist, list) {
|
|
Davidlohr Bueso |
17bb75 |
+ struct task_struct *tsk;
|
|
Davidlohr Bueso |
17bb75 |
|
|
Davidlohr Bueso |
17bb75 |
+ tsk = waiter->task;
|
|
Davidlohr Bueso |
17bb75 |
get_task_struct(tsk);
|
|
Davidlohr Bueso |
17bb75 |
- list_del(&waiter->list);
|
|
Davidlohr Bueso |
17bb75 |
+
|
|
Davidlohr Bueso |
17bb75 |
/*
|
|
Davidlohr Bueso |
17bb75 |
* Ensure calling get_task_struct() before setting the reader
|
|
Davidlohr Bueso |
17bb75 |
* waiter to nil such that rwsem_down_read_failed() cannot
|
|
Davidlohr Bueso |
17bb75 |
@@ -212,15 +237,6 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
|
|
Davidlohr Bueso |
17bb75 |
*/
|
|
Davidlohr Bueso |
17bb75 |
wake_q_add_safe(wake_q, tsk);
|
|
Davidlohr Bueso |
17bb75 |
}
|
|
Davidlohr Bueso |
17bb75 |
-
|
|
Davidlohr Bueso |
17bb75 |
- adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
|
|
Davidlohr Bueso |
17bb75 |
- if (list_empty(&sem->wait_list)) {
|
|
Davidlohr Bueso |
17bb75 |
- /* hit end of list above */
|
|
Davidlohr Bueso |
17bb75 |
- adjustment -= RWSEM_WAITING_BIAS;
|
|
Davidlohr Bueso |
17bb75 |
- }
|
|
Davidlohr Bueso |
17bb75 |
-
|
|
Davidlohr Bueso |
17bb75 |
- if (adjustment)
|
|
Davidlohr Bueso |
17bb75 |
- atomic_long_add(adjustment, &sem->count);
|
|
Davidlohr Bueso |
17bb75 |
}
|
|
Davidlohr Bueso |
17bb75 |
|
|
Davidlohr Bueso |
17bb75 |
/*
|
|
Davidlohr Bueso |
17bb75 |
--
|
|
Davidlohr Bueso |
17bb75 |
2.16.4
|
|
Davidlohr Bueso |
17bb75 |
|