From 8be5542de77e864bd47aaa225f41da8eeba3659d Mon Sep 17 00:00:00 2001
From: Michal Kubecek
Date: Oct 02 2020 10:32:28 +0000
Subject: Merge branch 'users/mgorman/SLE15-SP2/for-next' into SLE15-SP2

Pull performance fixes from Mel Gorman.

---

diff --git a/patches.suse/cpuidle-Poll-for-a-minimum-of-30ns-and-poll-for-a-tick-if-lower-c-states-are-disabled.patch b/patches.suse/cpuidle-Poll-for-a-minimum-of-30ns-and-poll-for-a-tick-if-lower-c-states-are-disabled.patch
new file mode 100644
index 0000000..f3ee3ed
--- /dev/null
+++ b/patches.suse/cpuidle-Poll-for-a-minimum-of-30ns-and-poll-for-a-tick-if-lower-c-states-are-disabled.patch
@@ -0,0 +1,93 @@
+From: mgorman
+Date: Fri, 25 Sep 2020 09:32:32 +0100
+Subject: [PATCH] cpuidle: Poll for a minimum of 30ns and poll for a tick if
+ lower c-states are disabled
+
+References: bnc#1176588
+Patch-mainline: Not yet, needs to be posted but will likely be rejected for favoring performance over power
+
+A bug was reported against a distribution kernel about a regression
+related to an application that has very large numbers of threads operating
+on large amounts of memory with a mix of page faults and address space
+modifications. The threads enter/exit idle states extremely rapidly and
+perf indicated that a large amount of time was spent on native_safe_halt.
+The application requires that cpuidle states be limited to C1 to reduce
+latencies on wakeup.
+
+The problem is that the application indirectly relied on behaviour similar
+to commit 36fcb4292473 ("cpuidle: use first valid target residency as
+poll time") where CPUs would poll for the lowest C-state exit latency
+before exiting. With the lower C-states disabled, the application more
+directly relies on commit a37b969a61c1 ("cpuidle: poll_state: Add time
+limit to poll_idle()") to poll a CPU until a rescheduling event occurs.
+
+Reverting to the old behaviour "works" but is extreme. Instead, this patch
+sets a baseline polling time that is close to the C2 exit latency and
+anecdotally is a common target for wakeup latency. It guesses whether the
+lower C-states have been disabled and, if so, polls until a rescheduling
+event occurs or a tick has passed. It is unlikely a full tick will pass,
+but this avoids the corner case that commit a37b969a61c1 ("cpuidle:
+poll_state: Add time limit to poll_idle()") was intended to avoid.
+
+Signed-off-by: Mel Gorman
+---
+ drivers/cpuidle/cpuidle.c | 28 ++++++++++++++++++++++++----
+ 1 file changed, 24 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
+index 29d2d7a21bd7..b903016e653b 100644
+--- a/drivers/cpuidle/cpuidle.c
++++ b/drivers/cpuidle/cpuidle.c
+@@ -361,6 +361,8 @@ void cpuidle_reflect(struct cpuidle_device *dev, int index)
+                 cpuidle_curr_governor->reflect(dev, index);
+ }
+ 
++#define MIN_POLL_TIME (30 * NSEC_PER_USEC)
++
+ /**
+  * cpuidle_poll_time - return amount of time to poll for,
+  * governors can override dev->poll_limit_ns if necessary
+@@ -373,21 +375,39 @@ u64 cpuidle_poll_time(struct cpuidle_driver *drv,
+                       struct cpuidle_device *dev)
+ {
+         int i;
+-        u64 limit_ns;
++        u64 limit_ns, max_limit;
+ 
+         if (dev->poll_limit_ns)
+                 return dev->poll_limit_ns;
+ 
+         limit_ns = TICK_NSEC;
++        max_limit = 0;
+         for (i = 1; i < drv->state_count; i++) {
++                u64 state_limit;
++
+                 if (drv->states[i].disabled || dev->states_usage[i].disable)
+                         continue;
+ 
+-                limit_ns = (u64)drv->states[i].target_residency * NSEC_PER_USEC;
+-                break;
++                state_limit = (u64)drv->states[i].target_residency * NSEC_PER_USEC;
++                if (limit_ns == TICK_NSEC)
++                        limit_ns = state_limit;
++                max_limit = state_limit;
++        }
++
++        dev->poll_limit_ns = max_t(u64, MIN_POLL_TIME, limit_ns);
++
++        /*
++         * If the deepest state is below the minimum, assume that c-states
++         * are limited by the driver or kernel command line and that latency
++         * is a concern. In this case, poll for longer periods.
++         */
++        if (max_limit < MIN_POLL_TIME) {
++                pr_info("cpuidle deepest latency of %llu below min %llu, idling based on tick\n",
++                        max_limit, (u64)MIN_POLL_TIME);
++                dev->poll_limit_ns = TICK_NSEC;
+         }
+ 
+-        dev->poll_limit_ns = limit_ns;
++        pr_info("cpuidle polling time = %llu ns\n", dev->poll_limit_ns);
+ 
+         return dev->poll_limit_ns;
+ }
diff --git a/patches.suse/locking-rwsem-Disable-reader-optimistic-spinning.patch b/patches.suse/locking-rwsem-Disable-reader-optimistic-spinning.patch
new file mode 100644
index 0000000..7bea0c8
--- /dev/null
+++ b/patches.suse/locking-rwsem-Disable-reader-optimistic-spinning.patch
@@ -0,0 +1,46 @@
+From dc02db32c1549120c1dc8ce80c20a9091a92d8cf Mon Sep 17 00:00:00 2001
+From: Davidlohr Bueso
+Date: Thu, 1 Oct 2020 09:01:42 -0700
+Subject: [PATCH] locking/rwsem: Disable reader optimistic spinning
+
+References: bnc#1176588
+Patch-mainline: Not yet, would need to be posted for discussion
+
+Reader spinning can cause performance issues in workloads that
+could otherwise benefit if the rwsem waiter would immediately
+block instead of busy-waiting on a contended lock. In addition,
+there can be architectural differences in the spinning that can
+further aggravate such problems, such as pause latencies and the
+overall cost of cacheline bouncing when trying to take the rwsem.
+
+This patch restores behavior closer to SLE15-SP1, where only
+writers can spin, alleviating a lot of the contention (and length)
+of the MCS queue. Writers can still spin on a reader-owned lock
+until they time out.
+
+Signed-off-by: Davidlohr Bueso
+Signed-off-by: Mel Gorman
+---
+ kernel/locking/rwsem.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
+index baafa1dd9fcc..fa02c9581a2b 100644
+--- a/kernel/locking/rwsem.c
++++ b/kernel/locking/rwsem.c
+@@ -662,6 +662,12 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
+ 
+         BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE));
+ 
++        /*
++         * Force readers down the slowpath.
++         */
++        if (nonspinnable == RWSEM_RD_NONSPINNABLE)
++                return false;
++
+         if (need_resched()) {
+                 lockevent_inc(rwsem_opt_fail);
+                 return false;
+-- 
+2.26.2
+
diff --git a/patches.suse/sched-numa-Avoid-creating-large-imbalances-at-task-creation-time.patch b/patches.suse/sched-numa-Avoid-creating-large-imbalances-at-task-creation-time.patch
new file mode 100644
index 0000000..7dc8e3c
--- /dev/null
+++ b/patches.suse/sched-numa-Avoid-creating-large-imbalances-at-task-creation-time.patch
@@ -0,0 +1,111 @@
+From: mgorman
+Date: Wed, 23 Sep 2020 18:10:35 +0100
+Subject: [PATCH] sched/numa: Avoid creating large imbalances at task creation
+ time
+
+References: bnc#1176588
+Patch-mainline: Not yet, needs to be posted but likely needs modification
+
+A bug was reported against a distribution kernel about a regression related
+to an application that has very large numbers of threads operating on
+large amounts of memory with a mix of page faults and address space
+modifications. System CPU usage was higher and elapsed time was much
+longer.
+
+Part of the problem is that the application relies on threads and their
+placement. As cloned threads remain local to the node if there is an
+idle CPU, they contend heavily on the LRU spinlock when faulting
+memory. It also creates a large imbalance of CPU utilisation until
+the load balancer intervenes, which does not happen quickly enough.
+As NUMA balancing is disabled because the application is partially
+NUMA-aware, the situation never recovers.
+
+This is not a representative test case, but similar symptoms can
+be seen with a benchmark that faults pages in parallel:
+
+                              baseline             patch
+Amean     system-1        4.36 (   0.00%)      4.33 *   0.88%*
+Amean     system-4        4.49 (   0.00%)      4.44 *   1.00%*
+Amean     system-7        4.80 (   0.00%)      4.67 *   2.80%*
+Amean     system-12       5.05 (   0.00%)      4.94 *   2.14%*
+Amean     system-21       7.98 (   0.00%)      5.10 *  36.04%*
+Amean     system-30       8.45 (   0.00%)      6.44 *  23.79%*
+Amean     system-48       9.40 (   0.00%)      9.38 (   0.24%)
+Amean     elapsed-1       5.70 (   0.00%)      5.68 *   0.45%*
+Amean     elapsed-4       1.48 (   0.00%)      1.47 *   0.70%*
+Amean     elapsed-7       0.92 (   0.00%)      0.89 *   3.10%*
+Amean     elapsed-12      0.57 (   0.00%)      0.55 *   2.94%*
+Amean     elapsed-21      0.51 (   0.00%)      0.33 *  34.38%*
+Amean     elapsed-30      0.39 (   0.00%)      0.35 *  10.27%*
+Amean     elapsed-48      0.24 (   0.00%)      0.25 (  -2.08%)
+
+The system has 48 cores in total; note the decrease in system CPU
+usage and the large decrease in elapsed time when one node is
+almost full.
+
+Allowing this imbalance is not the best possible solution. Ideally it
+would be reconciled with adjust_numa_imbalance() but that was a regression
+magnet when it first tried to allow imbalances. This is the minimal
+fix to act as a baseline before trying to reconcile all the imbalance
+handling across nodes.
+
+Signed-off-by: Mel Gorman
+---
+ kernel/sched/fair.c | 27 ++++++++++++++++++++-------
+ 1 file changed, 20 insertions(+), 7 deletions(-)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index db02212b2fba..1a3984801cc9 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -8679,9 +8679,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+                 .group_type = group_overloaded,
+         };
+ 
+-        imbalance = scale_load_down(NICE_0_LOAD) *
+-                                (sd->imbalance_pct-100) / 100;
+-
+         do {
+                 int local_group;
+ 
+@@ -8735,6 +8732,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+         switch (local_sgs.group_type) {
+         case group_overloaded:
+         case group_fully_busy:
++
++                /* Calculate allowed imbalance based on load */
++                imbalance = scale_load_down(NICE_0_LOAD) *
++                                (sd->imbalance_pct-100) / 100;
++
+                 /*
+                  * When comparing groups across NUMA domains, it's possible for
+                  * the local domain to be very lightly loaded relative to the
+@@ -8787,13 +8789,24 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+                         return idlest;
+                 }
+ #endif
++
+                 /*
+                  * Otherwise, keep the task on this node to stay close
+-                 * its wakeup source and improve locality. If there is
+-                 * a real need of migration, periodic load balance will
+-                 * take care of it.
++                 * to its wakeup source if it would not cause a large
++                 * imbalance. If there is a real need of migration,
++                 * periodic load balance will take care of it.
++                 */
++
++                /* See adjust_numa_imbalance */
++                imbalance = 2;
++
++                /*
++                 * Allow an imbalance if the node is not nearly full
++                 * and the imbalance between local and idlest is not
++                 * excessive.
+                  */
+-                if (local_sgs.idle_cpus)
++                if (local_sgs.idle_cpus >= imbalance &&
++                    idlest_sgs.idle_cpus - local_sgs.idle_cpus <= imbalance)
+                         return NULL;
+         }
+ 
diff --git a/patches.suse/sched-numa-Check-numa-balancing-information-only-when-enabled.patch b/patches.suse/sched-numa-Check-numa-balancing-information-only-when-enabled.patch
new file mode 100644
index 0000000..bd9ae04
--- /dev/null
+++ b/patches.suse/sched-numa-Check-numa-balancing-information-only-when-enabled.patch
@@ -0,0 +1,52 @@
+From: Mel Gorman
+Date: Wed, 23 Sep 2020 16:59:29 +0100
+Subject: [PATCH] sched/numa: Check numa balancing information only when
+ enabled
+
+References: bnc#1176588
+Patch-mainline: Not yet, needs to be posted with the follow-on patch
+
+When selecting the first CPU for a task to run on, the NUMA balancing
+information will be used if it is available. Avoid making the
+checks if NUMA balancing is disabled. This is of marginal interest;
+it just popped out while preparing the second patch.
+
+Signed-off-by: Mel Gorman
+---
+ kernel/sched/fair.c | 22 ++++++++++++----------
+ 1 file changed, 12 insertions(+), 10 deletions(-)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index 327e3325c396..db02212b2fba 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -8773,17 +8773,19 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+         case group_has_spare:
+                 if (sd->flags & SD_NUMA) {
+ #ifdef CONFIG_NUMA_BALANCING
+-                        int idlest_cpu;
+-                        /*
+-                         * If there is spare capacity at NUMA, try to select
+-                         * the preferred node
+-                         */
+-                        if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
+-                                return NULL;
++                        if (static_branch_likely(&sched_numa_balancing)) {
++                                int idlest_cpu;
++                                /*
++                                 * If there is spare capacity at NUMA, try to select
++                                 * the preferred node
++                                 */
++                                if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
++                                        return NULL;
+ 
+-                        idlest_cpu = cpumask_first(sched_group_span(idlest));
+-                        if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
+-                                return idlest;
++                                idlest_cpu = cpumask_first(sched_group_span(idlest));
++                                if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
++                                        return idlest;
++                        }
+ #endif
+                         /*
+                          * Otherwise, keep the task on this node to stay close
diff --git a/series.conf b/series.conf
index 15de97a..4fa2dea 100644
--- a/series.conf
+++ b/series.conf
@@ -15133,6 +15133,12 @@
         patches.suse/sched-fair-Clear-SMT-siblings-after-determining-the-core-is-not-idle.patch
         patches.suse/sched-nohz-Avoid-disabling-the-tick-for-very-short-durations.patch
 
+        # bnc#1176588
+        patches.suse/sched-numa-Check-numa-balancing-information-only-when-enabled.patch
+        patches.suse/sched-numa-Avoid-creating-large-imbalances-at-task-creation-time.patch
+        patches.suse/cpuidle-Poll-for-a-minimum-of-30ns-and-poll-for-a-tick-if-lower-c-states-are-disabled.patch
+        patches.suse/locking-rwsem-Disable-reader-optimistic-spinning.patch
+
         ########################################################
         # Memory management
         ########################################################
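
The poll-time selection logic added by
patches.suse/cpuidle-Poll-for-a-minimum-of-30ns-and-poll-for-a-tick-if-lower-c-states-are-disabled.patch
can be modelled by the simplified userspace sketch below. This is an
illustration only: the tick length, the 30us floor and the residency
tables in main() are example values, not taken from a real driver.

  #include <stdint.h>
  #include <stdio.h>

  #define NSEC_PER_USEC 1000ULL
  #define TICK_NSEC     (1000 * NSEC_PER_USEC)  /* assume HZ=1000, i.e. a 1ms tick */
  #define MIN_POLL_TIME (30 * NSEC_PER_USEC)    /* 30us floor */

  /*
   * Model of the new cpuidle_poll_time() selection: residency_us[] lists
   * the target residencies (in usec) of the *enabled* idle states, with
   * state 0 being the polling state itself.
   */
  static uint64_t poll_limit_ns(const uint64_t *residency_us, int nr_states)
  {
          uint64_t limit_ns = TICK_NSEC, max_limit = 0;
          int i;

          for (i = 1; i < nr_states; i++) {
                  uint64_t state_limit = residency_us[i] * NSEC_PER_USEC;

                  if (limit_ns == TICK_NSEC)      /* shallowest enabled state */
                          limit_ns = state_limit;
                  max_limit = state_limit;        /* deepest enabled state */
          }

          /* Poll for at least the 30us floor ... */
          if (limit_ns < MIN_POLL_TIME)
                  limit_ns = MIN_POLL_TIME;

          /* ... or for a full tick if the deeper C-states look disabled. */
          if (max_limit < MIN_POLL_TIME)
                  limit_ns = TICK_NSEC;

          return limit_ns;
  }

  int main(void)
  {
          const uint64_t only_c1[] = { 0, 2 };            /* C1 only -> one tick (1000000 ns) */
          const uint64_t c1_to_c6[] = { 0, 2, 20, 600 };  /* C1..C6 enabled -> 30000 ns */

          printf("%llu\n", (unsigned long long)poll_limit_ns(only_c1, 2));
          printf("%llu\n", (unsigned long long)poll_limit_ns(c1_to_c6, 4));
          return 0;
  }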
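
Similarly, the fork-time placement check added by
patches.suse/sched-numa-Avoid-creating-large-imbalances-at-task-creation-time.patch
behaves like the simplified standalone model below. The threshold of 2
mirrors the patch's reference to adjust_numa_imbalance(); the CPU counts
in main() are illustrative only.

  #include <stdbool.h>
  #include <stdio.h>

  #define IMBALANCE_ALLOWED 2     /* matches the "See adjust_numa_imbalance" value */

  /*
   * Keep a newly forked task on the local node only while the node still
   * has a few idle CPUs and is not much busier than the idlest node.
   */
  static bool keep_task_local(int local_idle_cpus, int idlest_idle_cpus)
  {
          /* Local node nearly full: spread to the idlest node. */
          if (local_idle_cpus < IMBALANCE_ALLOWED)
                  return false;

          /* Imbalance versus the idlest node already excessive: spread. */
          if (idlest_idle_cpus - local_idle_cpus > IMBALANCE_ALLOWED)
                  return false;

          /* Otherwise stay close to the wakeup source. */
          return true;
  }

  int main(void)
  {
          printf("%d\n", keep_task_local(12, 13));        /* 1: stay local */
          printf("%d\n", keep_task_local(1, 24));         /* 0: spread */
          return 0;
  }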