|
Mel Gorman |
41b622 |
From d9ae4243d5abc7c0ca9fb44a073f8d50450f435c Mon Sep 17 00:00:00 2001
|
|
Mel Gorman |
41b622 |
From: Jens Axboe <axboe@kernel.dk>
|
|
Mel Gorman |
41b622 |
Date: Thu, 28 Sep 2017 11:31:55 -0600
|
|
Mel Gorman |
41b622 |
Subject: [PATCH] writeback: only allow one inflight and pending full flush
|
|
Mel Gorman |
41b622 |
|
|
Mel Gorman |
41b622 |
References: bnc#1081213
|
|
Benjamin Poirier |
2528c7 |
Patch-mainline: v4.15-rc1
|
|
Mel Gorman |
41b622 |
Git-commit: aac8d41cd438f25bf3110fc6b98f1d16d7dbc169
|
|
Mel Gorman |
41b622 |
|
|
Mel Gorman |
41b622 |
When someone calls wakeup_flusher_threads() or
|
|
Mel Gorman |
41b622 |
wakeup_flusher_threads_bdi(), they schedule writeback of all dirty
|
|
Mel Gorman |
41b622 |
pages in the system (or on that bdi). If we are tight on memory, we
|
|
Mel Gorman |
41b622 |
can get tons of these queued from kswapd/vmscan. This causes (at
|
|
Mel Gorman |
41b622 |
least) two problems:
|
|
Mel Gorman |
41b622 |
|
|
Mel Gorman |
41b622 |
1) We consume a ton of memory just allocating writeback work items.
|
|
Mel Gorman |
41b622 |
We've seen as many as 600 million of these writeback work items
|
|
Mel Gorman |
41b622 |
pending. That's a lot of memory to pointlessly hold hostage,
|
|
Mel Gorman |
41b622 |
while the box is under memory pressure.
|
|
Mel Gorman |
41b622 |
|
|
Mel Gorman |
41b622 |
2) We spend so much time processing these work items, that we
|
|
Mel Gorman |
41b622 |
introduce a softlockup in writeback processing. This is because
|
|
Mel Gorman |
41b622 |
each of the writeback work items doesn't end up doing any work (it's
|
|
Mel Gorman |
41b622 |
hard when you have millions of identical ones coming in to the
|
|
Mel Gorman |
41b622 |
flush machinery), so we just sit in a tight loop pulling work
|
|
Mel Gorman |
41b622 |
items and deleting/freeing them.
|
|
Mel Gorman |
41b622 |
|
|
Mel Gorman |
41b622 |
Fix this by adding a 'start_all' bit to the writeback structure, and
|
|
Mel Gorman |
41b622 |
set that when someone attempts to flush all dirty pages. The bit is
|
|
Mel Gorman |
41b622 |
cleared when we start writeback on that work item. If the bit is
|
|
Mel Gorman |
41b622 |
already set when we attempt to queue !nr_pages writeback, then we
|
|
Mel Gorman |
41b622 |
simply ignore it.
|
|
Mel Gorman |
41b622 |
|
|
Mel Gorman |
41b622 |
This provides us with one full flush in flight, with one pending as well,
|
|
Mel Gorman |
41b622 |
and makes for more efficient handling of this type of writeback.
|
|
Mel Gorman |
41b622 |
|
|
Mel Gorman |
41b622 |
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
|
|
Mel Gorman |
41b622 |
Tested-by: Chris Mason <clm@fb.com>
|
|
Mel Gorman |
41b622 |
Reviewed-by: Jan Kara <jack@suse.cz>
|
|
Mel Gorman |
41b622 |
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
Mel Gorman |
41b622 |
Signed-off-by: Mel Gorman <mgorman@suse.de>
|
|
Mel Gorman |
41b622 |
---
|
|
Mel Gorman |
41b622 |
fs/fs-writeback.c | 25 +++++++++++++++++++++++++
|
|
Mel Gorman |
41b622 |
include/linux/backing-dev-defs.h | 1 +
|
|
Mel Gorman |
41b622 |
2 files changed, 26 insertions(+)
|
|
Mel Gorman |
41b622 |
|
|
Mel Gorman |
41b622 |
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
|
|
Mel Gorman |
41b622 |
index e8799b0b9727..b33ca14d8531 100644
|
|
Mel Gorman |
41b622 |
--- a/fs/fs-writeback.c
|
|
Mel Gorman |
41b622 |
+++ b/fs/fs-writeback.c
|
|
Mel Gorman |
41b622 |
@@ -53,6 +53,7 @@ struct wb_writeback_work {
|
|
Mel Gorman |
41b622 |
unsigned int for_background:1;
|
|
Mel Gorman |
41b622 |
unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
|
|
Mel Gorman |
41b622 |
unsigned int auto_free:1; /* free on completion */
|
|
Mel Gorman |
41b622 |
+ unsigned int start_all:1; /* nr_pages == 0 (all) writeback */
|
|
Mel Gorman |
41b622 |
enum wb_reason reason; /* why was writeback initiated? */
|
|
Mel Gorman |
41b622 |
|
|
Mel Gorman |
41b622 |
struct list_head list; /* pending work list */
|
|
Mel Gorman |
41b622 |
@@ -951,6 +952,20 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
|
|
Mel Gorman |
41b622 |
if (!wb_has_dirty_io(wb))
|
|
Mel Gorman |
41b622 |
return;
|
|
Mel Gorman |
41b622 |
|
|
Mel Gorman |
41b622 |
+ /*
|
|
Mel Gorman |
41b622 |
+ * All callers of this function want to start writeback of all
|
|
Mel Gorman |
41b622 |
+ * dirty pages. Places like vmscan can call this at a very
|
|
Mel Gorman |
41b622 |
+ * high frequency, causing pointless allocations of tons of
|
|
Mel Gorman |
41b622 |
+ * work items and keeping the flusher threads busy retrieving
|
|
Mel Gorman |
41b622 |
+ * that work. Ensure that we only allow one of them pending and
|
|
Mel Gorman |
41b622 |
+ * inflight at the time. It doesn't matter if we race a little
|
|
Mel Gorman |
41b622 |
+ * bit on this, so use the faster separate test/set bit variants.
|
|
Mel Gorman |
41b622 |
+ */
|
|
Mel Gorman |
41b622 |
+ if (test_bit(WB_start_all, &wb->state))
|
|
Mel Gorman |
41b622 |
+ return;
|
|
Mel Gorman |
41b622 |
+
|
|
Mel Gorman |
41b622 |
+ set_bit(WB_start_all, &wb->state);
|
|
Mel Gorman |
41b622 |
+
|
|
Mel Gorman |
41b622 |
/*
|
|
Mel Gorman |
41b622 |
* This is WB_SYNC_NONE writeback, so if allocation fails just
|
|
Mel Gorman |
41b622 |
* wakeup the thread for old dirty data writeback
|
|
Mel Gorman |
41b622 |
@@ -958,6 +973,7 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
|
|
Mel Gorman |
41b622 |
work = kzalloc(sizeof(*work),
|
|
Mel Gorman |
41b622 |
GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
|
|
Mel Gorman |
41b622 |
if (!work) {
|
|
Mel Gorman |
41b622 |
+ clear_bit(WB_start_all, &wb->state);
|
|
Mel Gorman |
41b622 |
trace_writeback_nowork(wb);
|
|
Mel Gorman |
41b622 |
wb_wakeup(wb);
|
|
Mel Gorman |
41b622 |
return;
|
|
Mel Gorman |
41b622 |
@@ -968,6 +984,7 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
|
|
Mel Gorman |
41b622 |
work->range_cyclic = 1;
|
|
Mel Gorman |
41b622 |
work->reason = reason;
|
|
Mel Gorman |
41b622 |
work->auto_free = 1;
|
|
Mel Gorman |
41b622 |
+ work->start_all = 1;
|
|
Mel Gorman |
41b622 |
|
|
Mel Gorman |
41b622 |
wb_queue_work(wb, work);
|
|
Mel Gorman |
41b622 |
}
|
|
Mel Gorman |
41b622 |
@@ -1821,6 +1838,14 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
|
|
Mel Gorman |
41b622 |
list_del_init(&work->list);
|
|
Mel Gorman |
41b622 |
}
|
|
Mel Gorman |
41b622 |
spin_unlock_bh(&wb->work_lock);
|
|
Mel Gorman |
41b622 |
+
|
|
Mel Gorman |
41b622 |
+ /*
|
|
Mel Gorman |
41b622 |
+ * Once we start processing a work item that had !nr_pages,
|
|
Mel Gorman |
41b622 |
+ * clear the wb state bit for that so we can allow more.
|
|
Mel Gorman |
41b622 |
+ */
|
|
Mel Gorman |
41b622 |
+ if (work && work->start_all)
|
|
Mel Gorman |
41b622 |
+ clear_bit(WB_start_all, &wb->state);
|
|
Mel Gorman |
41b622 |
+
|
|
Mel Gorman |
41b622 |
return work;
|
|
Mel Gorman |
41b622 |
}
|
|
Mel Gorman |
41b622 |
|
|
Mel Gorman |
41b622 |
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
|
|
Mel Gorman |
41b622 |
index 866c433e7d32..420de5c7c7f9 100644
|
|
Mel Gorman |
41b622 |
--- a/include/linux/backing-dev-defs.h
|
|
Mel Gorman |
41b622 |
+++ b/include/linux/backing-dev-defs.h
|
|
Mel Gorman |
41b622 |
@@ -24,6 +24,7 @@ enum wb_state {
|
|
Mel Gorman |
41b622 |
WB_shutting_down, /* wb_shutdown() in progress */
|
|
Mel Gorman |
41b622 |
WB_writeback_running, /* Writeback is in progress */
|
|
Mel Gorman |
41b622 |
WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */
|
|
Mel Gorman |
41b622 |
+ WB_start_all, /* nr_pages == 0 (all) work pending */
|
|
Mel Gorman |
41b622 |
};
|
|
Mel Gorman |
41b622 |
|
|
Mel Gorman |
41b622 |
enum wb_congested_state {
|