Blob Blame History Raw
From 75e83ec36bfc93bdafaf9dda2262b7ff006781cf Mon Sep 17 00:00:00 2001
From: Matt Fleming <mfleming@suse.de>
Date: Fri, 24 May 2019 22:11:42 +0100
Subject: [PATCH] sched/topology: Improve load balancing on AMD EPYC
Patch-mainline: Not yet, under discussion on LKML
References: bsc#1137366

SD_BALANCE_{FORK,EXEC} and SD_WAKE_AFFINE are stripped in sd_init()
for any sched domains with a NUMA distance greater than 2 hops
(RECLAIM_DISTANCE). The idea being that it's expensive to balance
across domains that far apart.

However, as is rather unfortunately explained in

  commit 32e45ff43eaf ("mm: increase RECLAIM_DISTANCE to 30")

the value for RECLAIM_DISTANCE is based on node distance tables from
2011-era hardware.

Current AMD EPYC machines have the following NUMA node distances:

node distances:
node   0   1   2   3   4   5   6   7
  0:  10  16  16  16  32  32  32  32
  1:  16  10  16  16  32  32  32  32
  2:  16  16  10  16  32  32  32  32
  3:  16  16  16  10  32  32  32  32
  4:  32  32  32  32  10  16  16  16
  5:  32  32  32  32  16  10  16  16
  6:  32  32  32  32  16  16  10  16
  7:  32  32  32  32  16  16  16  10

where 2 hops is 32.

The result is that the scheduler fails to load balance properly across
NUMA nodes on different sockets -- 2 hops apart.

For example, pinning 16 busy threads to NUMA nodes 0 (CPUs 0-7) and 4
(CPUs 32-39) like so,

  $ numactl -C 0-7,32-39 ./spinner 16

causes all threads to fork and remain on node 0 until the active
balancer kicks in after a few seconds and forcibly moves some threads
to node 4.

Override node_reclaim_distance for AMD Zen.

Signed-off-by: Matt Fleming <mfleming@suse.de>
---
 arch/x86/kernel/cpu/amd.c |    5 +++++
 include/linux/topology.h  |    3 +++
 kernel/sched/topology.c   |    3 ++-
 mm/khugepaged.c           |    2 +-
 mm/page_alloc.c           |    2 +-
 5 files changed, 12 insertions(+), 3 deletions(-)

--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -7,6 +7,7 @@
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
 #include <linux/random.h>
+#include <linux/topology.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
 #include <asm/cacheinfo.h>
@@ -905,6 +906,10 @@ static void init_amd_zn(struct cpuinfo_x
 		if (!cpu_has(c, X86_FEATURE_CPB))
 			set_cpu_cap(c, X86_FEATURE_CPB);
 
+#ifdef CONFIG_NUMA
+	node_reclaim_distance = 32;
+#endif
+
 		/*
 		 * Zen3 (Fam19 model < 0x10) parts are not susceptible to
 		 * Branch Type Confusion, but predate the allocation of the
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -59,6 +59,9 @@ int arch_update_cpu_topology(void);
  */
 #define RECLAIM_DISTANCE 30
 #endif
+
+extern int __read_mostly node_reclaim_distance;
+
 #ifndef PENALTY_FOR_NODE_WITH_CPUS
 #define PENALTY_FOR_NODE_WITH_CPUS	(1)
 #endif
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1070,6 +1070,7 @@ static int *sched_domains_numa_distance;
 int sched_max_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
 static int sched_domains_curr_level;
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
 #endif
 
 /*
@@ -1191,7 +1192,7 @@ sd_init(struct sched_domain_topology_lev
 		sd->idle_idx = 2;
 
 		sd->flags |= SD_SERIALIZE;
-		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+		if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
 			sd->flags &= ~(SD_BALANCE_EXEC |
 				       SD_BALANCE_FORK |
 				       SD_WAKE_AFFINE);
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -690,7 +690,7 @@ static bool khugepaged_scan_abort(int ni
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		if (!khugepaged_node_load[i])
 			continue;
-		if (node_distance(nid, i) > RECLAIM_DISTANCE)
+		if (node_distance(nid, i) > node_reclaim_distance)
 			return true;
 	}
 	return false;
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3238,7 +3238,7 @@ bool zone_watermark_ok_safe(struct zone
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
 	return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
-				RECLAIM_DISTANCE;
+				node_reclaim_distance;
 }
 #else	/* CONFIG_NUMA */
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)