Blob Blame History Raw
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Subject: s390/topology: alternative topology for topology-less machines
Patch-mainline: v4.14-rc2
Git-commit: 1b25fda0533462c9cee3a22e8a7bea68fa670af2
References: bnc#1066983, LTC#159177

Description:  kernel: alternative topology
Symptom:      Overall performance is lower than expected.
Problem:      When running within a z/VM guest the Linux scheduling domains
              are configured in such a way that each CPU is far away from
              all other CPUs. This way the kernel tries to avoid to send
              expensive inter processor interrupts to other CPUs. As a
              downside sometimes this also leads to the situation where
              idle CPUs won't be woken up even if there is a runable process
              waiting for execution. It depends on the workload if this
              behavior is good or not.
Solution:     Add a sysctl file /proc/sys/s390/topology which allows to
              change the default configuration. When running within a
              z/VM guest writing 1 to the file will result in a different
              topology where all CPUs are configured to be close to each
              other. This may increase overall performance depending on
              the workload.
              The used default topology within a z/VM guest can also be
              changed with the kernel command line parameter 'topology=on'.
Reproduction: -

Upstream-Description:

              s390/topology: alternative topology for topology-less machines

              If running on machines that do not provide topology information we
              currently generate a "fake" topology which defines the maximum
              distance between each cpu: each cpu will be put into an own drawer.

              Historically this used to be the best option for (virtual) machines in
              overcommited hypervisors.

              For some workloads however it is better to generate a different
              topology where all cpus are siblings within a package (all cpus are
              core siblings). This shows performance improvements of up to 10%,
              depending on the workload.

              In order to keep the current behaviour, but also allow to switch to
              the different core sibling topology use the existing "topology="
              kernel parameter:

              Specifying "topology=on" on machines without topology information will
              generate the core siblings (fake) topology information, instead of the
              default topology information where all cpus have the maximum distance.

              On machines which provide topology information specifying
              "topology=on" does not have any effect.

              Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
              Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>


Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Hannes Reinecke <hare@suse.com>
---
 arch/s390/kernel/early.c    |   12 -------
 arch/s390/kernel/topology.c |   72 +++++++++++++++++++++++++++++++++++++-------
 2 files changed, 61 insertions(+), 23 deletions(-)

--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -397,18 +397,6 @@ static inline void save_vector_registers
 #endif
 }
 
-static int __init topology_setup(char *str)
-{
-	bool enabled;
-	int rc;
-
-	rc = kstrtobool(str, &enabled);
-	if (!rc && !enabled)
-		S390_lowcore.machine_flags &= ~MACHINE_FLAG_TOPOLOGY;
-	return rc;
-}
-early_param("topology", topology_setup);
-
 static int __init disable_vector_extension(char *str)
 {
 	S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX;
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -29,12 +29,20 @@
 #define PTF_VERTICAL	(1UL)
 #define PTF_CHECK	(2UL)
 
+enum {
+	TOPOLOGY_MODE_HW,
+	TOPOLOGY_MODE_SINGLE,
+	TOPOLOGY_MODE_PACKAGE,
+	TOPOLOGY_MODE_UNINITIALIZED
+};
+
 struct mask_info {
 	struct mask_info *next;
 	unsigned char id;
 	cpumask_t mask;
 };
 
+static int topology_mode = TOPOLOGY_MODE_UNINITIALIZED;
 static void set_topology_timer(void);
 static void topology_work_fn(struct work_struct *work);
 static struct sysinfo_15_1_x *tl_info;
@@ -59,11 +67,26 @@ static cpumask_t cpu_group_map(struct ma
 	cpumask_t mask;
 
 	cpumask_copy(&mask, cpumask_of(cpu));
-	if (!MACHINE_HAS_TOPOLOGY)
-		return mask;
-	for (; info; info = info->next) {
-		if (cpumask_test_cpu(cpu, &info->mask))
-			return info->mask;
+	switch (topology_mode) {
+	case TOPOLOGY_MODE_HW:
+		while (info) {
+			if (cpumask_test_cpu(cpu, &info->mask)) {
+				mask = info->mask;
+				break;
+			}
+			info = info->next;
+		}
+		if (cpumask_empty(&mask))
+			cpumask_copy(&mask, cpumask_of(cpu));
+		break;
+	case TOPOLOGY_MODE_PACKAGE:
+		cpumask_copy(&mask, cpu_present_mask);
+		break;
+	default:
+		/* fallthrough */
+	case TOPOLOGY_MODE_SINGLE:
+		cpumask_copy(&mask, cpumask_of(cpu));
+		break;
 	}
 	return mask;
 }
@@ -74,7 +97,7 @@ static cpumask_t cpu_thread_map(unsigned
 	int i;
 
 	cpumask_copy(&mask, cpumask_of(cpu));
-	if (!MACHINE_HAS_TOPOLOGY)
+	if (topology_mode != TOPOLOGY_MODE_HW)
 		return mask;
 	cpu -= cpu % (smp_cpu_mtid + 1);
 	for (i = 0; i <= smp_cpu_mtid; i++)
@@ -223,7 +246,7 @@ int topology_set_cpu_management(int fc)
 static void update_cpu_masks(void)
 {
 	struct cpu_topology_s390 *topo;
-	int cpu;
+	int cpu, id;
 
 	for_each_possible_cpu(cpu) {
 		topo = &cpu_topology[cpu];
@@ -231,12 +254,13 @@ static void update_cpu_masks(void)
 		topo->core_mask = cpu_group_map(&socket_info, cpu);
 		topo->book_mask = cpu_group_map(&book_info, cpu);
 		topo->drawer_mask = cpu_group_map(&drawer_info, cpu);
-		if (!MACHINE_HAS_TOPOLOGY) {
+		if (topology_mode != TOPOLOGY_MODE_HW) {
+			id = topology_mode == TOPOLOGY_MODE_PACKAGE ? 0 : cpu;
 			topo->thread_id = cpu;
 			topo->core_id = cpu;
-			topo->socket_id = cpu;
-			topo->book_id = cpu;
-			topo->drawer_id = cpu;
+			topo->socket_id = id;
+			topo->book_id = id;
+			topo->drawer_id = id;
 			if (cpu_present(cpu))
 				cpumask_set_cpu(cpu, &cpus_with_topology);
 		}
@@ -459,6 +483,12 @@ void __init topology_init_early(void)
 	struct sysinfo_15_1_x *info;
 
 	set_sched_topology(s390_topology);
+	if (topology_mode == TOPOLOGY_MODE_UNINITIALIZED) {
+		if (MACHINE_HAS_TOPOLOGY)
+			topology_mode = TOPOLOGY_MODE_HW;
+		else
+			topology_mode = TOPOLOGY_MODE_SINGLE;
+	}
 	if (!MACHINE_HAS_TOPOLOGY)
 		goto out;
 	tl_info = memblock_virt_alloc(PAGE_SIZE, PAGE_SIZE);
@@ -474,6 +504,26 @@ out:
 	__arch_update_cpu_topology();
 }
 
+static inline int topology_get_mode(int enabled)
+{
+	if (!enabled)
+		return TOPOLOGY_MODE_SINGLE;
+	return MACHINE_HAS_TOPOLOGY ? TOPOLOGY_MODE_HW : TOPOLOGY_MODE_PACKAGE;
+}
+
+static int __init topology_setup(char *str)
+{
+	bool enabled;
+	int rc;
+
+	rc = kstrtobool(str, &enabled);
+	if (rc)
+		return rc;
+	topology_mode = topology_get_mode(enabled);
+	return 0;
+}
+early_param("topology", topology_setup);
+
 static int __init topology_init(void)
 {
 	if (MACHINE_HAS_TOPOLOGY)