Blob Blame History Raw
From c622a092a26996c46d01107cdbf9d3b4a15af046 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Fri, 5 Nov 2021 10:08:26 +0000
Subject: [PATCH] sched: Temporarily restore deprecated scheduler sysctls with
 a warning

References: bsc#1192327, bsc#1191396
Patch-mainline: Never, upstream wants bug reports to improve the default CPU scheduler

Commit 8a99b6833c88 ("sched: Move SCHED_DEBUG sysctl to debugfs") removed
the following sysctls in kernel 5.13:

	sched_min_granularity_ns sched_latency_ns sched_wakeup_granularity_ns sched_tunable_scaling sched_migration_cost_ns sched_nr_migrate numa_balancing_scan_delay_ms numa_balancing_scan_period_min_ms numa_balancing_scan_period_max_ms numa_balancing_scan_size_mb

They were removed as their behaviour is undocumented, poorly understood,
can have unexpected side-effects and in most cases the change is to
work around a CPU scheduler corner case. In other cases, the exact value
used is very machine and workload dependent.

Upstream is unlikely to ever revert the changes and instead wants bug
reports to fix any CPU scheduler corner case that is discovered. This
patch is intended to give customers a grace period for two service
packs to determine why these sysctls improve performance in a given
case and work with SUSE to get the problems fixed upstream before
the next major kernel revision.

Before behaviour
----------------
host:~/:[0]# sysctl sched_min_granularity_ns
sysctl: cannot stat /proc/sys/kernel/sched_min_granularity_ns: No such file or directory
host:~/:[255]#

After behaviour
---------------
host:~/:[130]# sysctl sched_min_granularity_ns
sched_min_granularity_ns = 3000000
host:~/:[0]# sysctl sched_min_granularity_ns=4000000
sched_min_granularity_ns = 4000000
host:~/:[0]# dmesg | tail -1
[  131.110313] The sched.sched_min_granularity_ns sysctl was moved to debugfs in kernel 5.13 for CPU scheduler debugging only. This sysctl will be removed in a future SLE release.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/sched/sysctl.h |  18 +++++++
 kernel/sched/fair.c          |  66 ++++++++++++++++++++++++++
 kernel/sysctl.c              | 110 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 194 insertions(+)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 304f431178fd..05f99aa4a388 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -81,4 +81,22 @@ int sched_energy_aware_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos);
 #endif
 
+#ifdef CONFIG_SCHED_DEBUG
+int sched_deprecated_proc_update_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *lenp, loff_t *ppos);
+int sched_warn_deprecated_proc_uint_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *lenp, loff_t *ppos);
+
+extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_latency;
+extern unsigned int sysctl_sched_wakeup_granularity;
+extern unsigned int sysctl_sched_tunable_scaling;
+extern unsigned int sysctl_sched_migration_cost;
+extern unsigned int sysctl_sched_nr_migrate;
+extern unsigned int sysctl_numa_balancing_scan_delay;
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_scan_size;
+#endif /* CONFIG_SCHED_DEBUG */
+
 #endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dbfd83bdf3de..1ff3414c2dbb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -646,6 +646,72 @@ int sched_update_scaling(void)
 }
 #endif
 
+#ifdef CONFIG_SCHED_DEBUG
+#define SCHED_NR_DEPRECATED 10	/* names below; one warned flag each */
+static const char * const sched_proc_deprecated[SCHED_NR_DEPRECATED] = {
+	"sched_min_granularity_ns",
+	"sched_latency_ns",
+	"sched_wakeup_granularity_ns",
+	"sched_tunable_scaling",
+	"sched_migration_cost_ns",
+	"sched_nr_migrate",
+	"numa_balancing_scan_delay_ms",
+	"numa_balancing_scan_period_min_ms",
+	"numa_balancing_scan_period_max_ms",
+	"numa_balancing_scan_size_mb",
+};
+
+static bool sched_proc_deprecated_warned[SCHED_NR_DEPRECATED];
+
+static void warn_proc_deprecated(const char *procname)
+{
+	int idx;
+
+	/*
+	 * Warn once per sysctl that it will be removed in a future SLE
+	 * release. Bug reports about this should describe the workload that
+	 * needs the sysctl set so that it can be determined whether the
+	 * default scheduler behaviour can be improved.
+	 */
+	for (idx = 0; idx < SCHED_NR_DEPRECATED; idx++) {
+		if (strcmp(procname, sched_proc_deprecated[idx]) != 0)
+			continue;
+		if (sched_proc_deprecated_warned[idx])
+			return;
+		pr_warn("The sched.%s sysctl was moved to debugfs in kernel "
+			"5.13 for CPU scheduler debugging only. This sysctl "
+			"will be removed in a future SLE release.\n",
+			procname);
+		sched_proc_deprecated_warned[idx] = true;
+		return;
+	}
+}
+
+int sched_deprecated_proc_update_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	if (!err && write) {	/* successful write only */
+		warn_proc_deprecated(table->procname);
+		err = sched_update_scaling();
+	}
+	return err;
+}
+
+int sched_warn_deprecated_proc_uint_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	/* Pass proc_dointvec_minmax() errors back; only a good write warns. */
+	if (ret || !write)
+		return ret;
+	warn_proc_deprecated(table->procname);
+	return 0;
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
 /*
  * delta /= w
  */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 64e2a090f8bc..e51d95a77f33 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -189,6 +189,17 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
 int sysctl_legacy_va_layout;
 #endif
 
+#ifdef CONFIG_SCHED_DEBUG
+static int min_sched_granularity_ns = 100000;		/* 100 usecs */
+static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
+static int min_wakeup_granularity_ns;			/* 0 usecs (zero-init) */
+static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
+#ifdef CONFIG_SMP
+static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
+static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
+
 #ifdef CONFIG_COMPACTION
 static int min_extfrag_threshold;
 static int max_extfrag_threshold = 1000;
@@ -1737,6 +1748,105 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_SCHED_DEBUG
+	/*
+	 * WARNING: DEPRECATED
+	 *
+	 * These sysctls no longer exist upstream and are being preserved only
+	 * for SLE 15 SP4 and SLE 15 SP5 with a warning displayed once if they
+	 * are used. Bugs should be filed if these tuning parameters are
+	 * necessary to determine if the default scheduler behaviour can be
+	 * improved.
+	 */
+	{
+		.procname	= "sched_min_granularity_ns",
+		.data		= &sysctl_sched_min_granularity,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_deprecated_proc_update_handler,
+		.extra1		= &min_sched_granularity_ns,
+		.extra2		= &max_sched_granularity_ns,
+	},
+	{
+		.procname	= "sched_latency_ns",
+		.data		= &sysctl_sched_latency,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_deprecated_proc_update_handler,
+		.extra1		= &min_sched_granularity_ns,
+		.extra2		= &max_sched_granularity_ns,
+	},
+	{
+		.procname	= "sched_wakeup_granularity_ns",
+		.data		= &sysctl_sched_wakeup_granularity,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_deprecated_proc_update_handler,
+		.extra1		= &min_wakeup_granularity_ns,
+		.extra2		= &max_wakeup_granularity_ns,
+	},
+#ifdef CONFIG_SMP
+	{
+		.procname	= "sched_tunable_scaling",
+		.data		= &sysctl_sched_tunable_scaling,
+		.maxlen		= sizeof(enum sched_tunable_scaling),
+		.mode		= 0644,
+		.proc_handler	= sched_deprecated_proc_update_handler,
+		.extra1		= &min_sched_tunable_scaling,
+		.extra2		= &max_sched_tunable_scaling,
+	},
+	{
+		.procname	= "sched_migration_cost_ns",
+		.data		= &sysctl_sched_migration_cost,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_warn_deprecated_proc_uint_handler,
+		.extra1		= SYSCTL_ONE,
+	},
+	{
+		.procname	= "sched_nr_migrate",
+		.data		= &sysctl_sched_nr_migrate,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_warn_deprecated_proc_uint_handler,
+		.extra1		= SYSCTL_ONE,
+	},
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_NUMA_BALANCING
+	{
+		.procname	= "numa_balancing_scan_delay_ms",
+		.data		= &sysctl_numa_balancing_scan_delay,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_warn_deprecated_proc_uint_handler,
+		.extra1		= SYSCTL_ZERO,
+	},
+	{
+		.procname	= "numa_balancing_scan_period_min_ms",
+		.data		= &sysctl_numa_balancing_scan_period_min,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_warn_deprecated_proc_uint_handler,
+		.extra1		= SYSCTL_ZERO,
+	},
+	{
+		.procname	= "numa_balancing_scan_period_max_ms",
+		.data		= &sysctl_numa_balancing_scan_period_max,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_warn_deprecated_proc_uint_handler,
+		.extra1		= SYSCTL_ZERO,
+	},
+	{
+		.procname	= "numa_balancing_scan_size_mb",
+		.data		= &sysctl_numa_balancing_scan_size,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_warn_deprecated_proc_uint_handler,
+		.extra1		= SYSCTL_ZERO,
+	},
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
 #ifdef CONFIG_SCHEDSTATS
 	{
 		.procname	= "sched_schedstats",