From 75e83ec36bfc93bdafaf9dda2262b7ff006781cf Mon Sep 17 00:00:00 2001
From: Matt Fleming <mfleming@suse.de>
Date: Fri, 24 May 2019 22:11:42 +0100
Subject: [PATCH] sched/topology: Improve load balancing on AMD EPYC
Patch-mainline: Not yet, under discussion on LKML
References: bsc#1137366
SD_BALANCE_{FORK,EXEC} and SD_WAKE_AFFINE are stripped in sd_init()
for any sched domains with a NUMA distance greater than 2 hops
(RECLAIM_DISTANCE). The idea being that it's expensive to balance
across domains that far apart.
However, as is rather unfortunately explained in
commit 32e45ff43eaf ("mm: increase RECLAIM_DISTANCE to 30")
the value for RECLAIM_DISTANCE is based on node distance tables from
2011-era hardware.
Current AMD EPYC machines have the following NUMA node distances:
node distances:
node 0 1 2 3 4 5 6 7
0: 10 16 16 16 32 32 32 32
1: 16 10 16 16 32 32 32 32
2: 16 16 10 16 32 32 32 32
3: 16 16 16 10 32 32 32 32
4: 32 32 32 32 10 16 16 16
5: 32 32 32 32 16 10 16 16
6: 32 32 32 32 16 16 10 16
7: 32 32 32 32 16 16 16 10
where 2 hops is 32.
The result is that the scheduler fails to load balance properly across
NUMA nodes on different sockets -- 2 hops apart.
For example, pinning 16 busy threads to NUMA nodes 0 (CPUs 0-7) and 4
(CPUs 32-39) like so,
$ numactl -C 0-7,32-39 ./spinner 16
causes all threads to fork and remain on node 0 until the active
balancer kicks in after a few seconds and forcibly moves some threads
to node 4.
Override node_reclaim_distance for AMD Zen.
Signed-off-by: Matt Fleming <mfleming@suse.de>
---
arch/x86/kernel/cpu/amd.c | 5 +++++
include/linux/topology.h | 3 +++
kernel/sched/topology.c | 3 ++-
mm/khugepaged.c | 2 +-
mm/page_alloc.c | 2 +-
5 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 57bb2100e05b..bb2f3e98efbf 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -7,6 +7,7 @@
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/random.h>
+#include <linux/topology.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cacheinfo.h>
@@ -812,6 +813,10 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
{
set_cpu_cap(c, X86_FEATURE_ZEN);
+#ifdef CONFIG_NUMA
+ node_reclaim_distance = 32;
+#endif
+
/*
* Fix erratum 1076: CPB feature bit not being set in CPUID.
* Always set it, except when running under a hypervisor.
diff --git a/include/linux/topology.h b/include/linux/topology.h
index cb0775e1ee4b..74b484354ac9 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -59,6 +59,9 @@ int arch_update_cpu_topology(void);
*/
#define RECLAIM_DISTANCE 30
#endif
+
+extern int __read_mostly node_reclaim_distance;
+
#ifndef PENALTY_FOR_NODE_WITH_CPUS
#define PENALTY_FOR_NODE_WITH_CPUS (1)
#endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8b646058fb57..57b4afe6387a 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1070,6 +1070,7 @@ static int *sched_domains_numa_distance;
int sched_max_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
static int sched_domains_curr_level;
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
#endif
/*
@@ -1191,7 +1192,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd->idle_idx = 2;
sd->flags |= SD_SERIALIZE;
- if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
sd->flags &= ~(SD_BALANCE_EXEC |
SD_BALANCE_FORK |
SD_WAKE_AFFINE);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 2c2813d90cb2..859f6fd0cb84 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -690,7 +690,7 @@ static bool khugepaged_scan_abort(int nid)
for (i = 0; i < MAX_NUMNODES; i++) {
if (!khugepaged_node_load[i])
continue;
- if (node_distance(nid, i) > RECLAIM_DISTANCE)
+ if (node_distance(nid, i) > node_reclaim_distance)
return true;
}
return false;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b8ba38dc77f4..0c8b489c0f0c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3191,7 +3191,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
- RECLAIM_DISTANCE;
+ node_reclaim_distance;
}
#else /* CONFIG_NUMA */
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
--
2.13.7