From 159f2d379e2e4586a7945d54584b200289cad655 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 10 Feb 2022 12:08:23 +0000
Subject: [PATCH] cpuidle: menu: Bias selection of a shallower c-state when CPU
idles for IO
References: bnc#1193353
Patch-mainline: Never, likely to be rejected for power vs performance
A basic I/O test with FIO doing large random reads indicated there
was a regression relative to older kernels. The test parameters were
fio --direct=0 --ioengine=sync --thread --directory=/mnt --invalidate=1
--group_reporting=1 --runtime=300 --fallocate=posix --ramp_time=10
--name=RandomReads-128000-32k-4 --new_group --rw=randread
--size=32000m --numjobs=4 --bs=32k
--filename_format=FioWorkloads.\$jobnum
Part of the problem is that CPUs fio is running on select the deepest
C-state for short durations. The predicted time to wakeups is not scaled
to the number of IO waiters like interactivity_req and latency_req and
the predictions often suggest an expected idle time far past the next
tick. While this adjusts, it can take a long time as not all idling will
update predictions due to polling or switching.
This patch first avoids disabling the tick if a task is sleeping for IO.
The minimum of the time to either the next tick or the expected scheduling
event is used to select a C-state.
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
drivers/cpuidle/governors/menu.c | 24 +++++++++++++++++++-----
1 file changed, 19 insertions(+), 5 deletions(-)
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 2e5670446991..944c34f093d0 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -397,14 +397,28 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
idx = 0; /* No states enabled. Must use 0. */
/*
- * Don't stop the tick if the selected state is a polling one or if the
- * expected idle duration is shorter than the tick period length.
+ * Don't stop the tick if the selected state is a polling one, if the
+ * expected idle duration is shorter than the tick period length or
+ * there is an io-waiter that may receive an interrupt soon.
*/
if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
- predicted_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) {
+ predicted_ns < TICK_NSEC || nr_iowaiters) && !tick_nohz_tick_stopped()) {
+ s64 threshold = ktime_to_ns(delta_tick);
+
*stop_tick = false;
- if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) {
+ /*
+ * For io-waiters, use either the soonest of either the next
+ * timer event or the predicted next wakeup event adjusted for
+ * the number of io-waiters. At worst, a shallow c-state will
+ * be used for too long as the IO takes longer than predicted
+ * to complete but the nr_iowaiters value should not be lost as
+ * it's tracked by the core scheduler.
+ */
+ if (nr_iowaiters)
+ threshold = min(threshold, latency_req);
+
+ if (idx > 0 && drv->states[idx].target_residency_ns > threshold) {
/*
* The tick is not going to be stopped and the target
* residency of the state to be returned is not within
@@ -416,7 +430,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
continue;
idx = i;
- if (drv->states[i].target_residency_ns <= delta_tick)
+ if (drv->states[i].target_residency_ns <= threshold)
break;
}
}