From 8be5542de77e864bd47aaa225f41da8eeba3659d Mon Sep 17 00:00:00 2001
From: Michal Kubecek
Date: Oct 02 2020 10:32:28 +0000
Subject: Merge branch 'users/mgorman/SLE15-SP2/for-next' into SLE15-SP2

Pull performance fixes from Mel Gorman.

---

diff --git a/patches.suse/cpuidle-Poll-for-a-minimum-of-30ns-and-poll-for-a-tick-if-lower-c-states-are-disabled.patch b/patches.suse/cpuidle-Poll-for-a-minimum-of-30ns-and-poll-for-a-tick-if-lower-c-states-are-disabled.patch
new file mode 100644
index 0000000..f3ee3ed
--- /dev/null
+++ b/patches.suse/cpuidle-Poll-for-a-minimum-of-30ns-and-poll-for-a-tick-if-lower-c-states-are-disabled.patch
@@ -0,0 +1,93 @@
+From: mgorman
+Date: Fri, 25 Sep 2020 09:32:32 +0100
+Subject: [PATCH] cpuidle: Poll for a minimum of 30ns and poll for a tick if
+ lower c-states are disabled
+
+References: bnc#1176588
+Patch-mainline: Not yet, needs to be posted but will likely be rejected for favoring performance over power
+
+A bug was reported against a distribution kernel about a regression
+related to an application that has very large numbers of threads operating
+on large amounts of memory with a mix of page faults and address space
+modifications. The threads enter/exit idle states extremely rapidly and
+perf indicated that a large amount of time was spent on native_safe_halt.
+The application requires that cpuidle states be limited to C1 to reduce
+latencies on wakeup.
+
+The problem is that the application indirectly relied on behaviour similar
+to commit 36fcb4292473 ("cpuidle: use first valid target residency as
+poll time") where CPUs would poll for the lowest C-state exit latency
+before exiting. With the lower C-states disabled, the application more
+directly relies on commit a37b969a61c1 ("cpuidle: poll_state: Add time
+limit to poll_idle()") to poll a CPU until a rescheduling event occurs.
+
+Reverting to the old behaviour "works" but is extreme. Instead, this patch
+sets a baseline polling time that is close to the C2 exit latency and
+anecdotally is a common target for wakeup latency. It guesses whether the
+lower C-states have been disabled and, if so, polls until a rescheduling
+event occurs or a tick has passed. It is unlikely a full tick will pass,
+but this avoids the corner case that commit a37b969a61c1 ("cpuidle:
+poll_state: Add time limit to poll_idle()") was intended to avoid.
+
+Signed-off-by: Mel Gorman
+---
+ drivers/cpuidle/cpuidle.c | 28 ++++++++++++++++++++++++----
+ 1 file changed, 24 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
+index 29d2d7a21bd7..b903016e653b 100644
+--- a/drivers/cpuidle/cpuidle.c
++++ b/drivers/cpuidle/cpuidle.c
+@@ -361,6 +361,8 @@ void cpuidle_reflect(struct cpuidle_device *dev, int index)
+                 cpuidle_curr_governor->reflect(dev, index);
+ }
+ 
++#define MIN_POLL_TIME (30 * NSEC_PER_USEC)
++
+ /**
+  * cpuidle_poll_time - return amount of time to poll for,
+  * governors can override dev->poll_limit_ns if necessary
+@@ -373,21 +375,39 @@ u64 cpuidle_poll_time(struct cpuidle_driver *drv,
+                       struct cpuidle_device *dev)
+ {
+         int i;
+-        u64 limit_ns;
++        u64 limit_ns, max_limit;
+ 
+         if (dev->poll_limit_ns)
+                 return dev->poll_limit_ns;
+ 
+         limit_ns = TICK_NSEC;
++        max_limit = 0;
+         for (i = 1; i < drv->state_count; i++) {
++                u64 state_limit;
++
+                 if (drv->states[i].disabled || dev->states_usage[i].disable)
+                         continue;
+ 
+-                limit_ns = (u64)drv->states[i].target_residency * NSEC_PER_USEC;
+-                break;
++                state_limit = (u64)drv->states[i].target_residency * NSEC_PER_USEC;
++                if (limit_ns == TICK_NSEC)
++                        limit_ns = state_limit;
++                max_limit = state_limit;
++        }
++
++        dev->poll_limit_ns = max_t(u64, MIN_POLL_TIME, limit_ns);
++
++        /*
++         * If the deepest state is below the minimum, assume that c-states
++         * are limited by the driver or kernel command line and that latency
++         * is a concern. In this case, poll for longer periods.
++         */
++        if (max_limit < MIN_POLL_TIME) {
++                pr_info("cpuidle deepest latency of %llu below min %llu, idling based on tick\n",
++                        max_limit, (u64)MIN_POLL_TIME);
++                dev->poll_limit_ns = TICK_NSEC;
+         }
+ 
+-        dev->poll_limit_ns = limit_ns;
++        pr_info("cpuidle polling time = %llu ns\n", dev->poll_limit_ns);
+ 
+         return dev->poll_limit_ns;
+ }
diff --git a/patches.suse/locking-rwsem-Disable-reader-optimistic-spinning.patch b/patches.suse/locking-rwsem-Disable-reader-optimistic-spinning.patch
new file mode 100644
index 0000000..7bea0c8
--- /dev/null
+++ b/patches.suse/locking-rwsem-Disable-reader-optimistic-spinning.patch
@@ -0,0 +1,46 @@
+From dc02db32c1549120c1dc8ce80c20a9091a92d8cf Mon Sep 17 00:00:00 2001
+From: Davidlohr Bueso
+Date: Thu, 1 Oct 2020 09:01:42 -0700
+Subject: [PATCH] locking/rwsem: Disable reader optimistic spinning
+
+References: bnc#1176588
+Patch-mainline: Not yet, would need to be posted for discussion
+
+Reader spinning can cause performance issues in workloads that
+could otherwise benefit if the rwsem waiter would immediately
+block instead of busy-waiting on a contended lock. In addition,
+there can be architectural differences in the spinning that can
+further aggravate such problems, such as pause latencies and the
+overall cost of cacheline bouncing when trying to take the rwsem.
+
+This patch restores behavior closer to SLE15-SP1, where only
+writers can spin, alleviating a lot of the contention (and length)
+of the MCS queue. Writers can still spin on a reader-owned lock
+until they time out.
+
+Signed-off-by: Davidlohr Bueso
+Signed-off-by: Mel Gorman
+---
+ kernel/locking/rwsem.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
+index baafa1dd9fcc..fa02c9581a2b 100644
+--- a/kernel/locking/rwsem.c
++++ b/kernel/locking/rwsem.c
+@@ -662,6 +662,12 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
+ 
+         BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE));
+ 
++        /*
++         * Force readers down the slowpath.
++         */
++        if (nonspinnable == RWSEM_RD_NONSPINNABLE)
++                return false;
++
+         if (need_resched()) {
+                 lockevent_inc(rwsem_opt_fail);
+                 return false;
+-- 
+2.26.2
+
diff --git a/patches.suse/sched-numa-Avoid-creating-large-imbalances-at-task-creation-time.patch b/patches.suse/sched-numa-Avoid-creating-large-imbalances-at-task-creation-time.patch
new file mode 100644
index 0000000..7dc8e3c
--- /dev/null
+++ b/patches.suse/sched-numa-Avoid-creating-large-imbalances-at-task-creation-time.patch
@@ -0,0 +1,111 @@
+From: mgorman
+Date: Wed, 23 Sep 2020 18:10:35 +0100
+Subject: [PATCH] sched/numa: Avoid creating large imbalances at task creation
+ time
+
+References: bnc#1176588
+Patch-mainline: Not yet, needs to be posted but likely needs modification
+
+A bug was reported against a distribution kernel about a regression related
+to an application that has very large numbers of threads operating on
+large amounts of memory with a mix of page faults and address space
+modifications. System CPU usage was higher and elapsed time was much
+longer.
+
+Part of the problem is that the application relies on threads and their
+placement. As cloned threads remain local to the node if there is an
+idle CPU, they contend heavily on the LRU spinlock when faulting
+memory. It also creates a large imbalance of CPU utilisation until
+the load balancer intervenes, which does not happen quickly enough.
+As NUMA balancing is disabled because the application is partially
+NUMA-aware, the situation never recovers.
+
+This is not a representative test case, but similar symptoms can
+be seen with a benchmark that faults pages in parallel:
+
+                              baseline             patch
+Amean     system-1        4.36 (   0.00%)      4.33 *   0.88%*
+Amean     system-4        4.49 (   0.00%)      4.44 *   1.00%*
+Amean     system-7        4.80 (   0.00%)      4.67 *   2.80%*
+Amean     system-12       5.05 (   0.00%)      4.94 *   2.14%*
+Amean     system-21       7.98 (   0.00%)      5.10 *  36.04%*
+Amean     system-30       8.45 (   0.00%)      6.44 *  23.79%*
+Amean     system-48       9.40 (   0.00%)      9.38 (   0.24%)
+Amean     elapsed-1       5.70 (   0.00%)      5.68 *   0.45%*
+Amean     elapsed-4       1.48 (   0.00%)      1.47 *   0.70%*
+Amean     elapsed-7       0.92 (   0.00%)      0.89 *   3.10%*
+Amean     elapsed-12      0.57 (   0.00%)      0.55 *   2.94%*
+Amean     elapsed-21      0.51 (   0.00%)      0.33 *  34.38%*
+Amean     elapsed-30      0.39 (   0.00%)      0.35 *  10.27%*
+Amean     elapsed-48      0.24 (   0.00%)      0.25 (  -2.08%)
+
+The system has 48 cores in total; note the decrease in system CPU
+usage and the large decrease in elapsed time when one node is
+almost full.
+
+Allowing this imbalance is not the best possible solution. Ideally it
+would be reconciled with adjust_numa_imbalance() but that was a regression
+magnet when it first tried to allow imbalances. This is the minimal
+fix to act as a baseline before trying to reconcile all the imbalance
+handling across nodes.
+
+Signed-off-by: Mel Gorman
+---
+ kernel/sched/fair.c | 27 ++++++++++++++++++++-------
+ 1 file changed, 20 insertions(+), 7 deletions(-)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index db02212b2fba..1a3984801cc9 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -8679,9 +8679,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+                 .group_type = group_overloaded,
+         };
+ 
+-        imbalance = scale_load_down(NICE_0_LOAD) *
+-                                (sd->imbalance_pct-100) / 100;
+-
+         do {
+                 int local_group;
+ 
+@@ -8735,6 +8732,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+         switch (local_sgs.group_type) {
+         case group_overloaded:
+         case group_fully_busy:
++
++                /* Calculate allowed imbalance based on load */
++                imbalance = scale_load_down(NICE_0_LOAD) *
++                                (sd->imbalance_pct-100) / 100;
++
+                 /*
+                  * When comparing groups across NUMA domains, it's possible for
+                  * the local domain to be very lightly loaded relative to the
+@@ -8787,13 +8789,24 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+                         return idlest;
+                 }
+ #endif
++
+                 /*
+                  * Otherwise, keep the task on this node to stay close
+-                 * its wakeup source and improve locality. If there is
+-                 * a real need of migration, periodic load balance will
+-                 * take care of it.
++                 * to its wakeup source if it would not cause a large
++                 * imbalance. If there is a real need of migration,
++                 * periodic load balance will take care of it.
++                 */
++
++                /* See adjust_numa_imbalance */
++                imbalance = 2;
++
++                /*
++                 * Allow an imbalance if the node is not nearly full
++                 * and the imbalance between local and idlest is not
++                 * excessive.
+                  */
+-                if (local_sgs.idle_cpus)
++                if (local_sgs.idle_cpus >= imbalance &&
++                    idlest_sgs.idle_cpus - local_sgs.idle_cpus <= imbalance)
+                         return NULL;
+         }
+ 
diff --git a/patches.suse/sched-numa-Check-numa-balancing-information-only-when-enabled.patch b/patches.suse/sched-numa-Check-numa-balancing-information-only-when-enabled.patch
new file mode 100644
index 0000000..bd9ae04
--- /dev/null
+++ b/patches.suse/sched-numa-Check-numa-balancing-information-only-when-enabled.patch
@@ -0,0 +1,52 @@
+From: Mel Gorman
+Date: Wed, 23 Sep 2020 16:59:29 +0100
+Subject: [PATCH] sched/numa: Check numa balancing information only when
+ enabled
+
+References: bnc#1176588
+Patch-mainline: Not yet, needs to be posted with the follow-on patch
+
+When selecting the first CPU for a task to run on, the NUMA balancing
+information will be used if it is available. Avoid making the
+checks if NUMA balancing is disabled. This is of marginal interest;
+it just popped out while preparing the second patch.
+
+Signed-off-by: Mel Gorman
+---
+ kernel/sched/fair.c | 22 ++++++++++++----------
+ 1 file changed, 12 insertions(+), 10 deletions(-)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index 327e3325c396..db02212b2fba 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -8773,17 +8773,19 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+         case group_has_spare:
+                 if (sd->flags & SD_NUMA) {
+ #ifdef CONFIG_NUMA_BALANCING
+-                        int idlest_cpu;
+-                        /*
+-                         * If there is spare capacity at NUMA, try to select
+-                         * the preferred node
+-                         */
+-                        if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
+-                                return NULL;
++                        if (static_branch_likely(&sched_numa_balancing)) {
++                                int idlest_cpu;
++                                /*
++                                 * If there is spare capacity at NUMA, try to select
++                                 * the preferred node
++                                 */
++                                if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
++                                        return NULL;
+ 
+-                        idlest_cpu = cpumask_first(sched_group_span(idlest));
+-                        if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
+-                                return idlest;
++                                idlest_cpu = cpumask_first(sched_group_span(idlest));
++                                if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
++                                        return idlest;
++                        }
+ #endif
+                         /*
+                          * Otherwise, keep the task on this node to stay close
diff --git a/series.conf b/series.conf
index 15de97a..4fa2dea 100644
--- a/series.conf
+++ b/series.conf
@@ -15133,6 +15133,12 @@
         patches.suse/sched-fair-Clear-SMT-siblings-after-determining-the-core-is-not-idle.patch
         patches.suse/sched-nohz-Avoid-disabling-the-tick-for-very-short-durations.patch
 
+        # bnc#1176588
+        patches.suse/sched-numa-Check-numa-balancing-information-only-when-enabled.patch
+        patches.suse/sched-numa-Avoid-creating-large-imbalances-at-task-creation-time.patch
+        patches.suse/cpuidle-Poll-for-a-minimum-of-30ns-and-poll-for-a-tick-if-lower-c-states-are-disabled.patch
+        patches.suse/locking-rwsem-Disable-reader-optimistic-spinning.patch
+
         ########################################################
         # Memory management
         ########################################################
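
The poll-time selection logic added by
patches.suse/cpuidle-Poll-for-a-minimum-of-30ns-and-poll-for-a-tick-if-lower-c-states-are-disabled.patch
can be modelled by the simplified userspace sketch below. This is an
illustration only: the tick length, the 30us floor and the residency
tables in main() are example values, not taken from a real driver.

  #include <stdint.h>
  #include <stdio.h>

  #define NSEC_PER_USEC 1000ULL
  #define TICK_NSEC     (1000 * NSEC_PER_USEC)  /* assume HZ=1000, i.e. a 1ms tick */
  #define MIN_POLL_TIME (30 * NSEC_PER_USEC)    /* 30us floor */

  /*
   * Model of the new cpuidle_poll_time() selection: residency_us[] lists
   * the target residencies (in usec) of the *enabled* idle states, with
   * state 0 being the polling state itself.
   */
  static uint64_t poll_limit_ns(const uint64_t *residency_us, int nr_states)
  {
          uint64_t limit_ns = TICK_NSEC, max_limit = 0;
          int i;

          for (i = 1; i < nr_states; i++) {
                  uint64_t state_limit = residency_us[i] * NSEC_PER_USEC;

                  if (limit_ns == TICK_NSEC)      /* shallowest enabled state */
                          limit_ns = state_limit;
                  max_limit = state_limit;        /* deepest enabled state */
          }

          /* Poll for at least the 30us floor ... */
          if (limit_ns < MIN_POLL_TIME)
                  limit_ns = MIN_POLL_TIME;

          /* ... or for a full tick if the deeper C-states look disabled. */
          if (max_limit < MIN_POLL_TIME)
                  limit_ns = TICK_NSEC;

          return limit_ns;
  }

  int main(void)
  {
          const uint64_t only_c1[] = { 0, 2 };            /* C1 only -> one tick (1000000 ns) */
          const uint64_t c1_to_c6[] = { 0, 2, 20, 600 };  /* C1..C6 enabled -> 30000 ns */

          printf("%llu\n", (unsigned long long)poll_limit_ns(only_c1, 2));
          printf("%llu\n", (unsigned long long)poll_limit_ns(c1_to_c6, 4));
          return 0;
  }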
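
Similarly, the fork-time placement check added by
patches.suse/sched-numa-Avoid-creating-large-imbalances-at-task-creation-time.patch
behaves like the simplified standalone model below. The threshold of 2
mirrors the patch's reference to adjust_numa_imbalance(); the CPU counts
in main() are illustrative only.

  #include <stdbool.h>
  #include <stdio.h>

  #define IMBALANCE_ALLOWED 2     /* matches the "See adjust_numa_imbalance" value */

  /*
   * Keep a newly forked task on the local node only while the node still
   * has a few idle CPUs and is not much busier than the idlest node.
   */
  static bool keep_task_local(int local_idle_cpus, int idlest_idle_cpus)
  {
          /* Local node nearly full: spread to the idlest node. */
          if (local_idle_cpus < IMBALANCE_ALLOWED)
                  return false;

          /* Imbalance versus the idlest node already excessive: spread. */
          if (idlest_idle_cpus - local_idle_cpus > IMBALANCE_ALLOWED)
                  return false;

          /* Otherwise stay close to the wakeup source. */
          return true;
  }

  int main(void)
  {
          printf("%d\n", keep_task_local(12, 13));        /* 1: stay local */
          printf("%d\n", keep_task_local(1, 24));         /* 0: spread */
          return 0;
  }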