From: Kurt Garloff <garloff@suse.de>
Subject: Make pagecache limit behavior w.r.t. dirty pages configurable
References: FATE309111
Patch-mainline: Never, SUSE specific

The last fixes to this patchset ensured that we don't end up calling
shrink_page_cache() [from add_to_page_cache()] again and again without
the ability to actually free something. For this reason we subtracted
the dirty pages from the count of freeable unmapped pages in the
calculation.

With this additional patch, a new sysctl
/proc/sys/vm/pagecache_limit_ignore_dirty
is introduced. With the default setting (1), behavior does not change.
When setting it to 0, we consider all of the dirty pages freeable --
we then allow a third pass in shrink_page_cache(), in which pages may
be written out (if the gfp_mask allows it).
Values above 1 are possible as well; with the value set to 2, half of
the dirty pages are considered freeable, and so on.
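
For illustration, the accounting can be sketched as follows (a
standalone sketch only; the helper name is made up and merely mirrors
the pagecache_over_limit() hunk below):

	/*
	 * Sketch only -- the helper name is invented for illustration;
	 * it mirrors the pagecache_over_limit() hunk below.
	 *   ignore_dirty == 0: nothing is subtracted, all dirty pages count
	 *   ignore_dirty == 1: all dirty pages are subtracted (default)
	 *   ignore_dirty == N: dirty/N pages are subtracted
	 */
	static unsigned long limited_pagecache(unsigned long lru_file_pages,
			unsigned long dirty_pages, unsigned int ignore_dirty)
	{
		if (ignore_dirty != 0)
			lru_file_pages -= dirty_pages / ignore_dirty;
		return lru_file_pages;
	}

With 1000 dirty pages, a value of 2 thus leaves 500 of them counted
against the limit.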

Signed-off-by: Kurt Garloff <garloff@suse.de>

---
 Documentation/vm/pagecache-limit |   13 +++++++++++--
 include/linux/swap.h             |    1 +
 kernel/sysctl.c                  |    7 +++++++
 mm/page_alloc.c                  |    6 ++++--
 mm/vmscan.c                      |    9 +++++++++
 5 files changed, 32 insertions(+), 4 deletions(-)

--- a/Documentation/vm/pagecache-limit
+++ b/Documentation/vm/pagecache-limit
@@ -1,6 +1,6 @@
 Functionality:
 -------------
-The patch introduces a new tunable in the proc filesystem:
+The patch introduces two new tunables in the proc filesystem:
 
 /proc/sys/vm/pagecache_limit_mb
 
@@ -15,6 +15,13 @@ As we only consider pagecache pages that
 NOTE: The real limit depends on the amount of free memory. Every existing free page allows the page cache to grow 8x the amount of free memory above the set baseline. As soon as the free memory is needed, we free up page cache.
 
 
+/proc/sys/vm/pagecache_limit_ignore_dirty
+
+The default for this setting is 1; this means that we don't consider dirty memory to be part of the limited pagecache, as we cannot easily free up dirty memory (we'd need to do writes for this). By setting this to 0, we actually consider dirty (unmapped) memory to be freeable and do a third pass in shrink_page_cache() where we schedule the pages for writeout. Values larger than 1 are also possible and result in a fraction of the dirty pages being considered non-freeable.
+
+
+
+
 How it works:
 ------------
 The heart of this patch is a new function called shrink_page_cache(). It is called from balance_pgdat (which is the worker for kswapd) if the pagecache is above the limit.
@@ -27,7 +34,9 @@ shrink_page_cache does several passes:
   This is fast -- but it might not find enough free pages; if that happens,
   the second pass will happen
 - In the second pass, pages from active list will also be considered.
-- The third pass is just another round of the second pass
+- The third pass will only happen if pagecache_limit_ignore_dirty is not 1.
+  In that case, the third pass is a repetition of the second pass, but this
+  time we allow pages to be written out.
 
 In all passes, only unmapped pages will be considered.
 
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -336,6 +336,7 @@ extern int vm_swappiness;
 extern unsigned long pagecache_over_limit(void);
 extern void shrink_page_cache(gfp_t mask, struct page *page);
 extern unsigned int vm_pagecache_limit_mb;
+extern unsigned int vm_pagecache_ignore_dirty;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern unsigned long vm_total_pages;
 
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1377,6 +1377,13 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.procname	= "pagecache_limit_ignore_dirty",
+		.data		= &vm_pagecache_ignore_dirty,
+		.maxlen		= sizeof(vm_pagecache_ignore_dirty),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #ifdef CONFIG_HUGETLB_PAGE
 	{
 		.procname	= "nr_hugepages",
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7809,12 +7809,14 @@ unsigned long pagecache_over_limit()
 	 * minus the dirty ones. (FIXME: pages accounted for in NR_WRITEBACK
 	 * are not on the LRU lists  any more, right?) */
 	unsigned long pgcache_lru_pages = global_page_state(NR_ACTIVE_FILE)
-				        + global_page_state(NR_INACTIVE_FILE)
-					- global_page_state(NR_FILE_DIRTY);
+				        + global_page_state(NR_INACTIVE_FILE);
 	unsigned long free_pages = global_page_state(NR_FREE_PAGES);
 	unsigned long swap_pages = total_swap_pages - atomic_long_read(&nr_swap_pages);
 	unsigned long limit;
 
+	if (vm_pagecache_ignore_dirty != 0)
+		pgcache_lru_pages -= global_page_state(NR_FILE_DIRTY) /
+				     vm_pagecache_ignore_dirty;
 	/* Paranoia */
 	if (unlikely(pgcache_lru_pages > LONG_MAX))
 		return 0;
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -150,6 +150,7 @@ struct scan_control {
  */
 int vm_swappiness = 60;
 unsigned int vm_pagecache_limit_mb __read_mostly = 0;
+unsigned int vm_pagecache_ignore_dirty __read_mostly = 1;
 /*
  * The total number of pages which are beyond the high watermark within all
  * zones.
@@ -3817,6 +3818,14 @@ static void __shrink_page_cache(gfp_t ma
 			if (ret >= nr_pages)
 				return;
 		}
+
+		if (pass == 1) {
+			if (vm_pagecache_ignore_dirty == 1 ||
+			    (mask & (__GFP_IO | __GFP_FS)) != (__GFP_IO | __GFP_FS))
+				break;
+			else
+				sc.may_writepage = 1;
+		}
 	}
 }