Blob Blame History Raw
From 41354d1f124ed4917a563b0d209c2e4711621268 Mon Sep 17 00:00:00 2001
From: Thomas Renninger <trenn@suse.com>
Date: Wed, 13 Sep 2017 20:31:19 +0100
Subject: [PATCH] cpufreq, intel_pstate: Use setpoint of 10% on servers

Patch-mainline: Never, upstream is trying to integrate scheduler/pm instead
References: bnc#945201,bnc#1064414

Changes in power idling in 3.13 and 3.14 caused major regressions in a number
of basic workloads that idle for short periods of time for either IO or
because they are client/server workloads. Upstream resists any attempt to
fix this on the basis that power consumption is higher if the CPU does not
idle very quickly. Their focus has been on having the cpufreq driver and
scheduler co-operate closely but it's months away from making any sort of
progress and in the meantime, server performance is severely impacted. This
patch boosts performance of dbench, pgbench and sysbench-oltp workloads
to be equivalent to SLE 12 SP1 performance.

Depending on ACPI FADT table's preferred PM profile (compare with ACPI
spec chapter 5.2.9.1 Preferred PM Profile System Types) the intel_pstate
performance tunables will be set to a more performance oriented policy.

intel_pstate=vanilla_policy
boot parameter will disable this functionality again.

intel_pstate=server_policy
will apply the performance optimized values also on laptops, desktops
or where the ACPI preferred PM profile value is not set.

hardy3 was used for evaluation. The expectation is that an impact is felt only
at relatively low utilisation or thread counts, depending on the scenario.

siege transactions
                             4.12.11                4.12.11
                      sle15-20170913      intel_pstate-v1r2
Hmean     1       4065.59 (   0.00%)     6045.11 (  48.69%)
Hmean     2       6672.53 (   0.00%)     8742.52 (  31.02%)
Hmean     4      14952.61 (   0.00%)    18843.06 (  26.02%)
Hmean     8      35775.77 (   0.00%)    39019.54 (   9.07%)
Hmean     16     67879.18 (   0.00%)    68383.78 (   0.74%)
Hmean     32     98246.31 (   0.00%)    98890.12 (   0.66%)
Hmean     64     94631.68 (   0.00%)    95715.03 (   1.14%)
Hmean     128    94287.33 (   0.00%)    94376.52 (   0.09%)
Hmean     250    86177.15 (   0.00%)    86641.38 (   0.54%)

pgbench Transactions
                             4.12.11                4.12.11
                      sle15-20170913      intel_pstate-v1r2
Hmean     1      13345.48 (   0.00%)    13945.89 (   4.50%)
Hmean     6      33028.99 (   0.00%)    41881.80 (  26.80%)
Hmean     12     51733.04 (   0.00%)    72846.06 (  40.81%)
Hmean     22    110182.02 (   0.00%)   126467.43 (  14.78%)
Hmean     30    127732.32 (   0.00%)   127494.44 (  -0.19%)
Hmean     48    127098.19 (   0.00%)   128968.20 (   1.47%)
Hmean     80    132963.83 (   0.00%)   126711.32 (  -4.70%)
Hmean     110   124245.25 (   0.00%)   129194.30 (   3.98%)
Hmean     142   126212.37 (   0.00%)   125594.58 (  -0.49%)
Hmean     160   135786.02 (   0.00%)   147557.93 (   8.67%)

dbench4 Loadfile Execution Time
                             4.12.11                4.12.11
                      sle15-20170913      intel_pstate-v1r2
Amean     1         52.24 (   0.00%)       42.69 (  18.29%)
Amean     2         50.70 (   0.00%)       36.09 (  28.82%)
Amean     4         45.39 (   0.00%)       35.15 (  22.56%)
Amean     8         56.35 (   0.00%)       53.75 (   4.62%)
Amean     16        98.32 (   0.00%)       93.80 (   4.60%)
Amean     32       187.72 (   0.00%)      181.48 (   3.33%)
Amean     64       599.68 (   0.00%)      509.57 (  15.03%)
Amean     128     2768.50 (   0.00%)     2671.00 (   3.52%)

Small boosts are also observed for sockperf and netperf although utilisation
is low enough there that it's not as reliable.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 drivers/cpufreq/intel_pstate.c |   45 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -43,6 +43,8 @@
 #define INTEL_CPUFREQ_TRANSITION_LATENCY	20000
 #define INTEL_CPUFREQ_TRANSITION_DELAY		500
 
+#define CPUFREQ_SERVER_DEFAULT_SETPOINT		10
+
 #ifdef CONFIG_ACPI
 #include <acpi/processor.h>
 #include <acpi/cppc_acpi.h>
@@ -2553,6 +2555,8 @@ static int intel_pstate_update_status(co
 static int no_load __initdata;
 static int no_hwp __initdata;
 static int hwp_only __initdata;
+static int __initdata vanilla_policy;
+static int __initdata server_policy;
 static unsigned int force_load __initdata;
 
 static int __init intel_pstate_msrs_not_valid(void)
@@ -2727,6 +2731,9 @@ static int __init intel_pstate_init(void
 {
 	const struct x86_cpu_id *id;
 	int rc;
+#if IS_ENABLED(CONFIG_ACPI)
+	const char *profile = NULL;
+#endif
 
 	if (no_load)
 		return -ENODEV;
@@ -2766,6 +2773,40 @@ hwp_cpu_matched:
 
 	pr_info("Intel P-state driver initializing\n");
 
+#if IS_ENABLED(CONFIG_ACPI)
+	if (!vanilla_policy) {
+		switch (acpi_gbl_FADT.preferred_profile) {
+		case PM_WORKSTATION:
+			profile = "Workstation";
+			break;
+		case PM_ENTERPRISE_SERVER:
+			profile = "Enterprise Server";
+			break;
+		case PM_SOHO_SERVER:
+			profile = "SOHO Server";
+			break;
+		case PM_PERFORMANCE_SERVER:
+			profile = "Performance Server";
+			break;
+		default:
+			if (server_policy)
+				profile = "Server";
+		};
+
+		if (profile) {
+			pr_info("Intel P-state setting %s policy\n", profile);
+
+			/*
+			 * Setpoint based on observations that siege maxes out
+			 * due to internal mutex usage at roughly an average of
+			 * 50%, so use a setpoint of 10% to boost the frequency
+			 * enough to perform reasonably.
+			 */
+			pid_params.setpoint = CPUFREQ_SERVER_DEFAULT_SETPOINT;
+		}
+	}
+#endif
+
 	all_cpu_data = vzalloc(sizeof(void *) * num_possible_cpus());
 	if (!all_cpu_data)
 		return -ENOMEM;
@@ -2807,6 +2848,10 @@ static int __init intel_pstate_setup(cha
 		force_load = 1;
 	if (!strcmp(str, "hwp_only"))
 		hwp_only = 1;
+	if (!strcmp(str, "vanilla_policy"))
+		vanilla_policy = 1;
+	if (!strcmp(str, "server_policy"))
+		server_policy = 1;
 	if (!strcmp(str, "per_cpu_perf_limits"))
 		per_cpu_limits = true;