Mel Gorman 41b622
From d9ae4243d5abc7c0ca9fb44a073f8d50450f435c Mon Sep 17 00:00:00 2001
Mel Gorman 41b622
From: Jens Axboe <axboe@kernel.dk>
Mel Gorman 41b622
Date: Thu, 28 Sep 2017 11:31:55 -0600
Mel Gorman 41b622
Subject: [PATCH] writeback: only allow one inflight and pending full flush
Mel Gorman 41b622
Mel Gorman 41b622
References: bnc#1081213
Benjamin Poirier 2528c7
Patch-mainline: v4.15-rc1
Mel Gorman 41b622
Git-commit: aac8d41cd438f25bf3110fc6b98f1d16d7dbc169
Mel Gorman 41b622
Mel Gorman 41b622
When someone calls wakeup_flusher_threads() or
Mel Gorman 41b622
wakeup_flusher_threads_bdi(), they schedule writeback of all dirty
Mel Gorman 41b622
pages in the system (or on that bdi). If we are tight on memory, we
Mel Gorman 41b622
can get tons of these queued from kswapd/vmscan. This causes (at
Mel Gorman 41b622
least) two problems:
Mel Gorman 41b622
Mel Gorman 41b622
1) We consume a ton of memory just allocating writeback work items.
Mel Gorman 41b622
   We've seen as much as 600 million of these writeback work items
Mel Gorman 41b622
   pending. That's a lot of memory to pointlessly hold hostage,
Mel Gorman 41b622
   while the box is under memory pressure.
Mel Gorman 41b622
Mel Gorman 41b622
2) We spend so much time processing these work items, that we
Mel Gorman 41b622
   introduce a softlockup in writeback processing. This is because
Mel Gorman 41b622
   each of the writeback work items doesn't end up doing any work (it's
Mel Gorman 41b622
   hard when you have millions of identical ones coming in to the
Mel Gorman 41b622
   flush machinery), so we just sit in a tight loop pulling work
Mel Gorman 41b622
   items and deleting/freeing them.
Mel Gorman 41b622
Mel Gorman 41b622
Fix this by adding a 'start_all' bit to the writeback structure, and
Mel Gorman 41b622
set that when someone attempts to flush all dirty pages. The bit is
Mel Gorman 41b622
cleared when we start writeback on that work item. If the bit is
Mel Gorman 41b622
already set when we attempt to queue !nr_pages writeback, then we
Mel Gorman 41b622
simply ignore it.
Mel Gorman 41b622
Mel Gorman 41b622
This gives us one full flush in flight, with one pending as well,
Mel Gorman 41b622
and makes for more efficient handling of this type of writeback.
Mel Gorman 41b622
Mel Gorman 41b622
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Mel Gorman 41b622
Tested-by: Chris Mason <clm@fb.com>
Mel Gorman 41b622
Reviewed-by: Jan Kara <jack@suse.cz>
Mel Gorman 41b622
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Mel Gorman 41b622
Signed-off-by: Mel Gorman <mgorman@suse.de>
Mel Gorman 41b622
---
Mel Gorman 41b622
 fs/fs-writeback.c                | 25 +++++++++++++++++++++++++
Mel Gorman 41b622
 include/linux/backing-dev-defs.h |  1 +
Mel Gorman 41b622
 2 files changed, 26 insertions(+)
Mel Gorman 41b622
Mel Gorman 41b622
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
Mel Gorman 41b622
index e8799b0b9727..b33ca14d8531 100644
Mel Gorman 41b622
--- a/fs/fs-writeback.c
Mel Gorman 41b622
+++ b/fs/fs-writeback.c
Mel Gorman 41b622
@@ -53,6 +53,7 @@ struct wb_writeback_work {
Mel Gorman 41b622
 	unsigned int for_background:1;
Mel Gorman 41b622
 	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
Mel Gorman 41b622
 	unsigned int auto_free:1;	/* free on completion */
Mel Gorman 41b622
+	unsigned int start_all:1;	/* nr_pages == 0 (all) writeback */
Mel Gorman 41b622
 	enum wb_reason reason;		/* why was writeback initiated? */
Mel Gorman 41b622
 
Mel Gorman 41b622
 	struct list_head list;		/* pending work list */
Mel Gorman 41b622
@@ -951,6 +952,20 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
Mel Gorman 41b622
 	if (!wb_has_dirty_io(wb))
Mel Gorman 41b622
 		return;
Mel Gorman 41b622
 
Mel Gorman 41b622
+	/*
Mel Gorman 41b622
+	 * All callers of this function want to start writeback of all
Mel Gorman 41b622
+	 * dirty pages. Places like vmscan can call this at a very
Mel Gorman 41b622
+	 * high frequency, causing pointless allocations of tons of
Mel Gorman 41b622
+	 * work items and keeping the flusher threads busy retrieving
Mel Gorman 41b622
+	 * that work. Ensure that we only allow one of them pending and
Mel Gorman 41b622
+	 * inflight at the time. It doesn't matter if we race a little
Mel Gorman 41b622
+	 * bit on this, so use the faster separate test/set bit variants.
Mel Gorman 41b622
+	 */
Mel Gorman 41b622
+	if (test_bit(WB_start_all, &wb->state))
Mel Gorman 41b622
+		return;
Mel Gorman 41b622
+
Mel Gorman 41b622
+	set_bit(WB_start_all, &wb->state);
Mel Gorman 41b622
+
Mel Gorman 41b622
 	/*
Mel Gorman 41b622
 	 * This is WB_SYNC_NONE writeback, so if allocation fails just
Mel Gorman 41b622
 	 * wakeup the thread for old dirty data writeback
Mel Gorman 41b622
@@ -958,6 +973,7 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
Mel Gorman 41b622
 	work = kzalloc(sizeof(*work),
Mel Gorman 41b622
 		       GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
Mel Gorman 41b622
 	if (!work) {
Mel Gorman 41b622
+		clear_bit(WB_start_all, &wb->state);
Mel Gorman 41b622
 		trace_writeback_nowork(wb);
Mel Gorman 41b622
 		wb_wakeup(wb);
Mel Gorman 41b622
 		return;
Mel Gorman 41b622
@@ -968,6 +984,7 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
Mel Gorman 41b622
 	work->range_cyclic = 1;
Mel Gorman 41b622
 	work->reason	= reason;
Mel Gorman 41b622
 	work->auto_free	= 1;
Mel Gorman 41b622
+	work->start_all = 1;
Mel Gorman 41b622
 
Mel Gorman 41b622
 	wb_queue_work(wb, work);
Mel Gorman 41b622
 }
Mel Gorman 41b622
@@ -1821,6 +1838,14 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
Mel Gorman 41b622
 		list_del_init(&work->list);
Mel Gorman 41b622
 	}
Mel Gorman 41b622
 	spin_unlock_bh(&wb->work_lock);
Mel Gorman 41b622
+
Mel Gorman 41b622
+	/*
Mel Gorman 41b622
+	 * Once we start processing a work item that had !nr_pages,
Mel Gorman 41b622
+	 * clear the wb state bit for that so we can allow more.
Mel Gorman 41b622
+	 */
Mel Gorman 41b622
+	if (work && work->start_all)
Mel Gorman 41b622
+		clear_bit(WB_start_all, &wb->state);
Mel Gorman 41b622
+
Mel Gorman 41b622
 	return work;
Mel Gorman 41b622
 }
Mel Gorman 41b622
 
Mel Gorman 41b622
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
Mel Gorman 41b622
index 866c433e7d32..420de5c7c7f9 100644
Mel Gorman 41b622
--- a/include/linux/backing-dev-defs.h
Mel Gorman 41b622
+++ b/include/linux/backing-dev-defs.h
Mel Gorman 41b622
@@ -24,6 +24,7 @@ enum wb_state {
Mel Gorman 41b622
 	WB_shutting_down,	/* wb_shutdown() in progress */
Mel Gorman 41b622
 	WB_writeback_running,	/* Writeback is in progress */
Mel Gorman 41b622
 	WB_has_dirty_io,	/* Dirty inodes on ->b_{dirty|io|more_io} */
Mel Gorman 41b622
+	WB_start_all,		/* nr_pages == 0 (all) work pending */
Mel Gorman 41b622
 };
Mel Gorman 41b622
 
Mel Gorman 41b622
 enum wb_congested_state {