From 65d759c8f9f57b96c199f3fe5cfb93ac7da095e9 Mon Sep 17 00:00:00 2001
From: Charan Teja Reddy <charante@codeaurora.org>
Date: Thu, 2 Sep 2021 14:59:59 -0700
Subject: [PATCH] mm: compaction: support triggering of proactive compaction by
 user
References: bsc#1207010
Git-commit: 65d759c8f9f57b96c199f3fe5cfb93ac7da095e9
Patch-mainline: v5.15-rc1

The proactive compaction[1] gets triggered for every 500msec and run
compaction on the node for COMPACTION_HPAGE_ORDER (usually order-9) pages
based on the value set to sysctl.compaction_proactiveness.  Triggering the
compaction for every 500msec in search of COMPACTION_HPAGE_ORDER pages is
not needed for all applications, especially on the embedded system
usecases which may have few MB's of RAM.  Enabling the proactive
compaction in its state will endup in running almost always on such
systems.

Other side, proactive compaction can still be very much useful for getting
a set of higher order pages in some controllable manner(controlled by
using the sysctl.compaction_proactiveness).  So, on systems where enabling
the proactive compaction always may proove not required, can trigger the
same from user space on write to its sysctl interface.  As an example, say
app launcher decide to launch the memory heavy application which can be
launched fast if it gets more higher order pages thus launcher can prepare
the system in advance by triggering the proactive compaction from
userspace.

This triggering of proactive compaction is done on a write to
sysctl.compaction_proactiveness by user.

[1]https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=facdaa917c4d5a376d09d25865f5a863f906234a

[akpm@linux-foundation.org: tweak vm.rst, per Mike]

Link: https://lkml.kernel.org/r/1627653207-12317-1-git-send-email-charante@codeaurora.org
Signed-off-by: Charan Teja Reddy <charante@codeaurora.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Rafael Aquini <aquini@redhat.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Nitin Gupta <nigupta@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Khalid Aziz <khalid.aziz@oracle.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Vinayak Menon <vinmenon@codeaurora.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Abraham <tabraham@suse.com>
---
 Documentation/admin-guide/sysctl/vm.rst |  3 +-
 include/linux/compaction.h              |  2 ++
 include/linux/mmzone.h                  |  1 +
 kernel/sysctl.c                         |  2 +-
 mm/compaction.c                         | 38 +++++++++++++++++++++++--
 5 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 003d5cc3751b..5e795202111f 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -118,7 +118,8 @@ compaction_proactiveness
 
 This tunable takes a value in the range [0, 100] with a default value of
 20. This tunable determines how aggressively compaction is done in the
-background. Setting it to 0 disables proactive compaction.
+background. Write of a non zero value to this tunable will immediately
+trigger the proactive compaction. Setting it to 0 disables proactive compaction.
 
 Note that compaction has a non-trivial system-wide impact as pages
 belonging to different processes are moved around, which could also lead
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index c24098c7acca..34bce35c808d 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -84,6 +84,8 @@ static inline unsigned long compact_gap(unsigned int order)
 extern unsigned int sysctl_compaction_proactiveness;
 extern int sysctl_compaction_handler(struct ctl_table *table, int write,
 			void *buffer, size_t *length, loff_t *ppos);
+extern int compaction_proactiveness_sysctl_handler(struct ctl_table *table,
+		int write, void *buffer, size_t *length, loff_t *ppos);
 extern int sysctl_extfrag_threshold;
 extern int sysctl_compact_unevictable_allowed;
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 59bad25ce78e..1bd5f5955f9a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -846,6 +846,7 @@ typedef struct pglist_data {
 	enum zone_type kcompactd_highest_zoneidx;
 	wait_queue_head_t kcompactd_wait;
 	struct task_struct *kcompactd;
+	bool proactive_compact_trigger;
 #endif
 	/*
 	 * This is a per-node reserve of pages that are not available
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 272f4a272f8c..297f0b3966bd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2871,7 +2871,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &sysctl_compaction_proactiveness,
 		.maxlen		= sizeof(sysctl_compaction_proactiveness),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= compaction_proactiveness_sysctl_handler,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_hundred,
 	},
diff --git a/mm/compaction.c b/mm/compaction.c
index 4ee0d40d93f2..fa9b2b598eab 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2706,6 +2706,30 @@ static void compact_nodes(void)
  */
 unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
 
+int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *length, loff_t *ppos)
+{
+	int rc, nid;
+
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	if (write && sysctl_compaction_proactiveness) {
+		for_each_online_node(nid) {
+			pg_data_t *pgdat = NODE_DATA(nid);
+
+			if (pgdat->proactive_compact_trigger)
+				continue;
+
+			pgdat->proactive_compact_trigger = true;
+			wake_up_interruptible(&pgdat->kcompactd_wait);
+		}
+	}
+
+	return 0;
+}
+
 /*
  * This is the entry point for compacting all nodes via
  * /proc/sys/vm/compact_memory
@@ -2750,7 +2774,8 @@ void compaction_unregister_node(struct node *node)
 
 static inline bool kcompactd_work_requested(pg_data_t *pgdat)
 {
-	return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
+	return pgdat->kcompactd_max_order > 0 || kthread_should_stop() ||
+		pgdat->proactive_compact_trigger;
 }
 
 static bool kcompactd_node_suitable(pg_data_t *pgdat)
@@ -2901,9 +2926,16 @@ static int kcompactd(void *p)
 	while (!kthread_should_stop()) {
 		unsigned long pflags;
 
+		/*
+		 * Avoid the unnecessary wakeup for proactive compaction
+		 * when it is disabled.
+		 */
+		if (!sysctl_compaction_proactiveness)
+			timeout = MAX_SCHEDULE_TIMEOUT;
 		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
 		if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
-			kcompactd_work_requested(pgdat), timeout)) {
+			kcompactd_work_requested(pgdat), timeout) &&
+			!pgdat->proactive_compact_trigger) {
 
 			psi_memstall_enter(&pflags);
 			kcompactd_do_work(pgdat);
@@ -2938,6 +2970,8 @@ static int kcompactd(void *p)
 				timeout =
 				   default_timeout << COMPACT_MAX_DEFER_SHIFT;
 		}
+		if (unlikely(pgdat->proactive_compact_trigger))
+			pgdat->proactive_compact_trigger = false;
 	}
 
 	return 0;
-- 
2.35.3