From 4087c5f1d2fc8be43e8eecc65c20d022cbaa872c Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 3 Sep 2018 17:22:05 +0100
Subject: [PATCH] sched/numa: Stop comparing tasks for NUMA placement after
 selecting an idle core

References: bnc#1101669 optimise numa balancing for fast migrate
Patch-mainline: No, under review but tested by QA

task_numa_migrate is responsible for finding a core on a preferred NUMA
node for a task. As part of this, task_numa_find_cpu iterates through
the CPUs of a node and evaluates them, both idle CPUs and CPUs with
running tasks, as placement candidates. Generally though, any idle CPU
is equivalent in terms of improving the imbalance and continuing the
search after finding one is pointless. This patch stops examining CPUs
on a node once an idle CPU has been accepted as a suitable candidate.
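
To illustrate the resulting control flow, the following is a minimal
stand-alone sketch in plain user-space C rather than kernel code; the
candidate array and compare_task() are hypothetical stand-ins for the
cpumask walk in task_numa_find_cpu() and for task_numa_compare(), which
now reports whether an idle destination was selected:

  /*
   * Minimal user-space sketch, not kernel code.  compare_task() mirrors
   * the new bool return of task_numa_compare(): true means an idle
   * destination was accepted and the scan of the node can stop.
   */
  #include <stdbool.h>
  #include <stdio.h>

  struct candidate {
      int cpu;
      bool idle;
  };

  /* Any idle CPU is good enough, so accept it and report success. */
  static bool compare_task(const struct candidate *c, int *best_cpu)
  {
      if (c->idle) {
          *best_cpu = c->cpu;
          return true;
      }
      /* Otherwise a task swap would be weighed here, as before the patch. */
      return false;
  }

  int main(void)
  {
      struct candidate node_cpus[] = {
          { 0, false }, { 1, false }, { 2, true }, { 3, true },
      };
      int best = -1;
      unsigned int i;

      for (i = 0; i < sizeof(node_cpus) / sizeof(node_cpus[0]); i++) {
          /* Stop scanning the node as soon as an idle CPU is found. */
          if (compare_task(&node_cpus[i], &best))
              break;
      }
      printf("selected CPU %d\n", best);
      return 0;
  }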

While there are some workloads that show minor gains and losses, they are
mostly within the noise, with the exception of specjbb, whether running
as one large JVM or as one JVM per socket. The following was reported on
a two-socket Haswell machine with 24 cores per socket.

specjbb, one JVM per socket (2 total)
                              4.19.0-rc1             4.19.0-rc1
                                 vanilla           oneselect-v1
Hmean     tput-1     42258.43 (   0.00%)    43692.10 (   3.39%)
Hmean     tput-2     87811.26 (   0.00%)    93719.52 (   6.73%)
Hmean     tput-3    138100.56 (   0.00%)   143484.08 (   3.90%)
Hmean     tput-4    181061.51 (   0.00%)   191292.99 (   5.65%)
Hmean     tput-5    225577.34 (   0.00%)   233439.58 (   3.49%)
Hmean     tput-6    264763.44 (   0.00%)   270634.50 (   2.22%)
Hmean     tput-7    301458.48 (   0.00%)   314133.32 (   4.20%)
Hmean     tput-8    348364.50 (   0.00%)   358445.76 (   2.89%)
Hmean     tput-9    382129.65 (   0.00%)   403288.75 (   5.54%)
Hmean     tput-10   403566.70 (   0.00%)   444592.51 (  10.17%)
Hmean     tput-11   456967.43 (   0.00%)   483300.45 (   5.76%)
Hmean     tput-12   502295.98 (   0.00%)   526281.53 (   4.78%)
Hmean     tput-13   441284.41 (   0.00%)   535507.75 (  21.35%)
Hmean     tput-14   461478.57 (   0.00%)   542068.97 (  17.46%)
Hmean     tput-15   489725.29 (   0.00%)   545033.17 (  11.29%)
Hmean     tput-16   503726.56 (   0.00%)   549738.23 (   9.13%)
Hmean     tput-17   528650.57 (   0.00%)   550849.00 (   4.20%)
Hmean     tput-18   518065.41 (   0.00%)   550018.29 (   6.17%)
Hmean     tput-19   527412.99 (   0.00%)   550652.26 (   4.41%)
Hmean     tput-20   528166.25 (   0.00%)   545783.85 (   3.34%)
Hmean     tput-21   524669.70 (   0.00%)   544848.37 (   3.85%)
Hmean     tput-22   519010.38 (   0.00%)   539603.70 (   3.97%)
Hmean     tput-23   514947.43 (   0.00%)   534714.32 (   3.84%)
Hmean     tput-24   517953.29 (   0.00%)   531783.24 (   2.67%)

The coefficient of variation is roughly 0-3% depending on the warehouse
count, so these results are generally outside the noise. Note that the
biggest improvements are seen when a socket is roughly half loaded. It is
not especially obvious why this would be true given that without the patch
the socket is scanned anyway, but it may be cache-miss related. On a
two-socket Broadwell machine, the same observation was made in that the
biggest benefit was when a socket was half loaded. When a single JVM is
used for the entire machine, the biggest benefit was also when the machine
was half utilised.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 kernel/sched/fair.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6374a8a2abc1..a2034ab28842 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1564,7 +1564,7 @@ static bool load_too_imbalanced(long src_load, long dst_load,
  * into account that it might be best if task running on the dst_cpu should
  * be exchanged with the source task
  */
-static void task_numa_compare(struct task_numa_env *env,
+static bool task_numa_compare(struct task_numa_env *env,
 			      long taskimp, long groupimp, bool maymove)
 {
 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
@@ -1574,6 +1574,7 @@ static void task_numa_compare(struct task_numa_env *env,
 	long imp = env->p->numa_group ? groupimp : taskimp;
 	long moveimp = imp;
 	int dist = env->dist;
+	bool dst_idle = false;
 
 	rcu_read_lock();
 	cur = task_rcu_dereference(&dst_rq->curr);
@@ -1667,11 +1668,13 @@ static void task_numa_compare(struct task_numa_env *env,
 		env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
 						   env->dst_cpu);
 		local_irq_enable();
+		dst_idle = true;
 	}
 
 	task_numa_assign(env, cur, imp);
 unlock:
 	rcu_read_unlock();
+	return dst_idle;
 }
 
 static void task_numa_find_cpu(struct task_numa_env *env,
@@ -1697,7 +1700,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,
 			continue;
 
 		env->dst_cpu = cpu;
-		task_numa_compare(env, taskimp, groupimp, maymove);
+		if (task_numa_compare(env, taskimp, groupimp, maymove))
+			break;
 	}
 }