Michal Suchanek 1e43cf
From 580c785ca3c60509d9e489a3d71deacb068973f0 Mon Sep 17 00:00:00 2001
Michal Suchanek 1e43cf
From: Valentin Schneider <valentin.schneider@arm.com>
Michal Suchanek 1e43cf
Date: Wed, 18 Aug 2021 13:13:33 +0530
Michal Suchanek 1e43cf
Subject: [PATCH] sched/topology: Skip updating masks for non-online nodes
Michal Suchanek 1e43cf
Michal Suchanek 1e43cf
Patch-mainline: v5.15-rc1
Michal Suchanek 1e43cf
References: bsc#1197446 ltc#183000
Michal Suchanek 1e43cf
Git-commit: 0083242c93759dde353a963a90cb351c5c283379
Michal Suchanek 1e43cf
Michal Suchanek 1e43cf
The scheduler currently expects NUMA node distances to be stable from
Michal Suchanek 1e43cf
init onwards, and as a consequence builds the related data structures
Michal Suchanek 1e43cf
once-and-for-all at init (see sched_init_numa()).
Michal Suchanek 1e43cf
Michal Suchanek 1e43cf
Unfortunately, on some architectures node distance is unreliable for
Michal Suchanek 1e43cf
offline nodes and may very well change upon onlining.
Michal Suchanek 1e43cf
Michal Suchanek 1e43cf
Skip over offline nodes during sched_init_numa(). Track nodes that have
Michal Suchanek 1e43cf
been onlined at least once, and trigger a build of a node's NUMA masks
Michal Suchanek 1e43cf
when it is first onlined post-init.
Michal Suchanek 1e43cf
Michal Suchanek 1e43cf
Reported-by: Geetika Moolchandani <Geetika.Moolchandani1@ibm.com>
Michal Suchanek 1e43cf
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Michal Suchanek 1e43cf
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Michal Suchanek 1e43cf
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Michal Suchanek 1e43cf
Link: https://lkml.kernel.org/r/20210818074333.48645-1-srikar@linux.vnet.ibm.com
Michal Suchanek 1e43cf
Signed-off-by: Mel Gorman <mgorman@suse.de>
Michal Suchanek 1e43cf
---
Michal Suchanek 1e43cf
 kernel/sched/topology.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++
Michal Suchanek 1e43cf
 1 file changed, 65 insertions(+)
Michal Suchanek 1e43cf
Michal Suchanek 1e43cf
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
Michal Suchanek 1e43cf
--- a/kernel/sched/topology.c
Michal Suchanek 1e43cf
+++ b/kernel/sched/topology.c
Michal Suchanek 1e43cf
@@ -1290,6 +1290,8 @@ int				sched_max_numa_distance;
Michal Suchanek 1e43cf
 static int			*sched_domains_numa_distance;
Michal Suchanek 1e43cf
 static struct cpumask		***sched_domains_numa_masks;
Michal Suchanek 1e43cf
 int __read_mostly		node_reclaim_distance = RECLAIM_DISTANCE;
Michal Suchanek 1e43cf
+
Michal Suchanek 1e43cf
+static unsigned long __read_mostly *sched_numa_onlined_nodes;
Michal Suchanek 1e43cf
 #endif
Michal Suchanek 1e43cf
 
Michal Suchanek 1e43cf
 /*
Michal Suchanek 1e43cf
@@ -1655,6 +1657,16 @@ void sched_init_numa(void)
Michal Suchanek 1e43cf
 			sched_domains_numa_masks[i][j] = mask;
Michal Suchanek 1e43cf
 
Michal Suchanek 1e43cf
 			for_each_node(k) {
Michal Suchanek 1e43cf
+				/*
Michal Suchanek 1e43cf
+				 * Distance information can be unreliable for
Michal Suchanek 1e43cf
+				 * offline nodes, defer building the node
Michal Suchanek 1e43cf
+				 * masks to its bringup.
Michal Suchanek 1e43cf
+				 * This relies on all unique distance values
Michal Suchanek 1e43cf
+				 * still being visible at init time.
Michal Suchanek 1e43cf
+				 */
Michal Suchanek 1e43cf
+				if (!node_online(j))
Michal Suchanek 1e43cf
+					continue;
Michal Suchanek 1e43cf
+
Michal Suchanek 1e43cf
 				if (node_distance(j, k) > sched_domains_numa_distance[i])
Michal Suchanek 1e43cf
 					continue;
Michal Suchanek 1e43cf
 
Michal Suchanek 1e43cf
@@ -1705,6 +1717,53 @@ void sched_init_numa(void)
Michal Suchanek 1e43cf
 	sched_max_numa_distance = sched_domains_numa_distance[level - 1];
Michal Suchanek 1e43cf
 
Michal Suchanek 1e43cf
 	init_numa_topology_type();
Michal Suchanek 1e43cf
+
Michal Suchanek 1e43cf
+	sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
Michal Suchanek 1e43cf
+	if (!sched_numa_onlined_nodes)
Michal Suchanek 1e43cf
+		return;
Michal Suchanek 1e43cf
+
Michal Suchanek 1e43cf
+	bitmap_zero(sched_numa_onlined_nodes, nr_node_ids);
Michal Suchanek 1e43cf
+	for_each_online_node(i)
Michal Suchanek 1e43cf
+		bitmap_set(sched_numa_onlined_nodes, i, 1);
Michal Suchanek 1e43cf
+}
Michal Suchanek 1e43cf
+
Michal Suchanek 1e43cf
+static void __sched_domains_numa_masks_set(unsigned int node)
Michal Suchanek 1e43cf
+{
Michal Suchanek 1e43cf
+	int i, j;
Michal Suchanek 1e43cf
+
Michal Suchanek 1e43cf
+	/*
Michal Suchanek 1e43cf
+	 * NUMA masks are not built for offline nodes in sched_init_numa().
Michal Suchanek 1e43cf
+	 * Thus, when a CPU of a never-onlined-before node gets plugged in,
Michal Suchanek 1e43cf
+	 * adding that new CPU to the right NUMA masks is not sufficient: the
Michal Suchanek 1e43cf
+	 * masks of that CPU's node must also be updated.
Michal Suchanek 1e43cf
+	 */
Michal Suchanek 1e43cf
+	if (test_bit(node, sched_numa_onlined_nodes))
Michal Suchanek 1e43cf
+		return;
Michal Suchanek 1e43cf
+
Michal Suchanek 1e43cf
+	bitmap_set(sched_numa_onlined_nodes, node, 1);
Michal Suchanek 1e43cf
+
Michal Suchanek 1e43cf
+	for (i = 0; i < sched_domains_numa_levels; i++) {
Michal Suchanek 1e43cf
+		for (j = 0; j < nr_node_ids; j++) {
Michal Suchanek 1e43cf
+			if (!node_online(j) || node == j)
Michal Suchanek 1e43cf
+				continue;
Michal Suchanek 1e43cf
+
Michal Suchanek 1e43cf
+			if (node_distance(j, node) > sched_domains_numa_distance[i])
Michal Suchanek 1e43cf
+				continue;
Michal Suchanek 1e43cf
+
Michal Suchanek 1e43cf
+			/* Add remote nodes in our masks */
Michal Suchanek 1e43cf
+			cpumask_or(sched_domains_numa_masks[i][node],
Michal Suchanek 1e43cf
+				   sched_domains_numa_masks[i][node],
Michal Suchanek 1e43cf
+				   sched_domains_numa_masks[0][j]);
Michal Suchanek 1e43cf
+		}
Michal Suchanek 1e43cf
+	}
Michal Suchanek 1e43cf
+
Michal Suchanek 1e43cf
+	/*
Michal Suchanek 1e43cf
+	 * A new node has been brought up, potentially changing the topology
Michal Suchanek 1e43cf
+	 * classification.
Michal Suchanek 1e43cf
+	 *
Michal Suchanek 1e43cf
+	 * Note that this is racy vs any use of sched_numa_topology_type :/
Michal Suchanek 1e43cf
+	 */
Michal Suchanek 1e43cf
+	init_numa_topology_type();
Michal Suchanek 1e43cf
 }
Michal Suchanek 1e43cf
 
Michal Suchanek 1e43cf
 void sched_domains_numa_masks_set(unsigned int cpu)
Michal Suchanek 1e43cf
@@ -1712,8 +1771,14 @@ void sched_domains_numa_masks_set(unsign
Michal Suchanek 1e43cf
 	int node = cpu_to_node(cpu);
Michal Suchanek 1e43cf
 	int i, j;
Michal Suchanek 1e43cf
 
Michal Suchanek 1e43cf
+	__sched_domains_numa_masks_set(node);
Michal Suchanek 1e43cf
+
Michal Suchanek 1e43cf
 	for (i = 0; i < sched_domains_numa_levels; i++) {
Michal Suchanek 1e43cf
 		for (j = 0; j < nr_node_ids; j++) {
Michal Suchanek 1e43cf
+			if (!node_online(j))
Michal Suchanek 1e43cf
+				continue;
Michal Suchanek 1e43cf
+
Michal Suchanek 1e43cf
+			/* Set ourselves in the remote node's masks */
Michal Suchanek 1e43cf
 			if (node_distance(j, node) <= sched_domains_numa_distance[i])
Michal Suchanek 1e43cf
 				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
Michal Suchanek 1e43cf
 		}