From: Thomas Renninger <trenn@suse.com>
Date: Wed, 13 Sep 2017 20:31:19 +0100
Subject: [PATCH] cpufreq, intel_pstate: Use setpoint of 40% on servers

Patch-mainline: Never, upstream is trying to integrate scheduler/pm instead
References: bnc#945201

Changes in power idling in 3.13 and 3.14 caused major regressions in a number
of basic workloads that idle for short periods of time, either for IO or
because they are client/server workloads. Upstream resists any attempt to
fix this on the basis that power consumption is higher if the CPU does not
idle very quickly. Their focus has been on having the cpufreq driver and
scheduler co-operate closely but it's months away from making any sort of
progress and in the meantime, server performance is severely impacted. This
patch boosts performance of dbench, pgbench and sysbench-oltp workloads
to be equivalent to SLE 12 SP1 performance.

Depending on ACPI FADT table's preferred PM profile (compare with ACPI
spec chapter 5.2.9.1 Preferred PM Profile System Types) the intel_pstate
performance tunables will be set to a more performance oriented policy.

intel_pstate=vanilla_policy
boot parameter will disable this functionality again.

intel_pstate=server_policy
will apply the performance optimized values also on laptops, desktops
or where the ACPI preferred PM profile value is not set.

hardy3 was used for evaluation. Expectation is that an impact is felt only
at relatively low utilisation or thread counts, depending on the scenario.

siege transactions
                             4.12.11                4.12.11
                      sle15-20170913      intel_pstate-v1r2
Hmean     1       4065.59 (   0.00%)     6045.11 (  48.69%)
Hmean     2       6672.53 (   0.00%)     8742.52 (  31.02%)
Hmean     4      14952.61 (   0.00%)    18843.06 (  26.02%)
Hmean     8      35775.77 (   0.00%)    39019.54 (   9.07%)
Hmean     16     67879.18 (   0.00%)    68383.78 (   0.74%)
Hmean     32     98246.31 (   0.00%)    98890.12 (   0.66%)
Hmean     64     94631.68 (   0.00%)    95715.03 (   1.14%)
Hmean     128    94287.33 (   0.00%)    94376.52 (   0.09%)
Hmean     250    86177.15 (   0.00%)    86641.38 (   0.54%)

pgbench Transactions
                             4.12.11                4.12.11
                      sle15-20170913      intel_pstate-v1r2
Hmean     1      13345.48 (   0.00%)    13945.89 (   4.50%)
Hmean     6      33028.99 (   0.00%)    41881.80 (  26.80%)
Hmean     12     51733.04 (   0.00%)    72846.06 (  40.81%)
Hmean     22    110182.02 (   0.00%)   126467.43 (  14.78%)
Hmean     30    127732.32 (   0.00%)   127494.44 (  -0.19%)
Hmean     48    127098.19 (   0.00%)   128968.20 (   1.47%)
Hmean     80    132963.83 (   0.00%)   126711.32 (  -4.70%)
Hmean     110   124245.25 (   0.00%)   129194.30 (   3.98%)
Hmean     142   126212.37 (   0.00%)   125594.58 (  -0.49%)
Hmean     160   135786.02 (   0.00%)   147557.93 (   8.67%)

dbench4 Loadfile Execution Time
                             4.12.11                4.12.11
                      sle15-20170913      intel_pstate-v1r2
Amean     1         52.24 (   0.00%)       42.69 (  18.29%)
Amean     2         50.70 (   0.00%)       36.09 (  28.82%)
Amean     4         45.39 (   0.00%)       35.15 (  22.56%)
Amean     8         56.35 (   0.00%)       53.75 (   4.62%)
Amean     16        98.32 (   0.00%)       93.80 (   4.60%)
Amean     32       187.72 (   0.00%)      181.48 (   3.33%)
Amean     64       599.68 (   0.00%)      509.57 (  15.03%)
Amean     128     2768.50 (   0.00%)     2671.00 (   3.52%)

Small boosts are also observed for sockperf and netperf although utilisation
is low enough there that it's not as reliable.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 drivers/cpufreq/intel_pstate.c | 43 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 0d82f3302df6..0c6997aa99f6 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -2384,6 +2384,8 @@ static int intel_pstate_update_status(const char *buf, size_t size)
 static int no_load __initdata;
 static int no_hwp __initdata;
 static int hwp_only __initdata;
+static int __initdata vanilla_policy;
+static int __initdata server_policy;
 static unsigned int force_load __initdata;
 
 static int __init intel_pstate_msrs_not_valid(void)
@@ -2564,6 +2566,9 @@ static const struct x86_cpu_id hwp_support_ids[] __initconst = {
 static int __init intel_pstate_init(void)
 {
 	int rc;
+#if IS_ENABLED(CONFIG_ACPI)
+	const char *profile = NULL;
+#endif
 
 	if (no_load)
 		return -ENODEV;
@@ -2604,6 +2609,40 @@ static int __init intel_pstate_init(void)
 
 	pr_info("Intel P-state driver initializing\n");
 
+#if IS_ENABLED(CONFIG_ACPI)
+	if (!vanilla_policy) {
+		switch (acpi_gbl_FADT.preferred_profile) {
+		case PM_WORKSTATION:
+			profile = "Workstation";
+			break;
+		case PM_ENTERPRISE_SERVER:
+			profile = "Enterprise Server";
+			break;
+		case PM_SOHO_SERVER:
+			profile = "SOHO Server";
+			break;
+		case PM_PERFORMANCE_SERVER:
+			profile = "Performance Server";
+			break;
+		default:
+			if (server_policy)
+				profile = "Server";
+		};
+
+		if (profile) {
+			pr_info("Intel P-state setting %s policy\n", profile);
+
+			/*
+			 * setpoint based on observations that siege maxes out
+			 * due to internal mutex usage at roughly an average of
+			 * 50% set use a setpoint of 40% to boost the frequency
+			 * enough to perform reasonably.
+			 */
+			pid_params.setpoint = 40;
+		}
+	}
+#endif
+
 	all_cpu_data = vzalloc(sizeof(void *) * num_possible_cpus());
 	if (!all_cpu_data)
 		return -ENOMEM;
@@ -2645,6 +2684,10 @@ static int __init intel_pstate_setup(char *str)
 		force_load = 1;
 	if (!strcmp(str, "hwp_only"))
 		hwp_only = 1;
+	if (!strcmp(str, "vanilla_policy"))
+		vanilla_policy = 1;
+	if (!strcmp(str, "server_policy"))
+		server_policy = 1;
 	if (!strcmp(str, "per_cpu_perf_limits"))
 		per_cpu_limits = true;