Blob Blame History Raw
From 1ccf7a59558eed58195f5cf1e3fbc26ac27e1610 Mon Sep 17 00:00:00 2001
From: Evan Quan <evan.quan@amd.com>
Date: Fri, 22 May 2020 18:57:11 +0800
Subject: drm/amdgpu: added a sysfs interface for thermal throttling related V4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Git-commit: b265bdbd9f2811c2a674ea9d1161de821923897c
Patch-mainline: v5.9-rc1
References: jsc#SLE-12680, jsc#SLE-12880, jsc#SLE-12882, jsc#SLE-12883, jsc#SLE-13496, jsc#SLE-15322

User can check and set the enablement of throttling logging and
the interval between each logging.

V2: simplify the sysfs interface(no string parsing)
V3: add proper lock protection on updating throttling_logging_rs.interval
V4: documentation cosmetic per Luben's suggestion

Signed-off-by: Evan Quan <evan.quan@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Luben Tuikov <luben.tuikov@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Patrik Jakobsson <pjakobsson@suse.de>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c     | 68 ++++++++++++++++++++++
 drivers/gpu/drm/amd/powerplay/smu_v11_0.c  | 10 ++--
 4 files changed, 86 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index cd913986863e..922d5bca14c0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -993,6 +993,9 @@ struct amdgpu_device {
 	char				serial[16];
 
 	struct amdgpu_autodump		autodump;
+
+	atomic_t			throttling_logging_enabled;
+	struct ratelimit_state		throttling_logging_rs;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2f0e8da7bacf..e5b639a1575e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3035,6 +3035,17 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	adev->gfx.gfx_off_req_count = 1;
 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
 
+	atomic_set(&adev->throttling_logging_enabled, 1);
+	/*
+	 * If throttling continues, logging will be performed every minute
+	 * to avoid log flooding. "-1" is subtracted since the thermal
+	 * throttling interrupt comes every second. Thus, the total logging
+	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
+	 * for throttling interrupt) = 60 seconds.
+	 */
+	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
+	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
+
 	/* Registers mapping */
 	/* TODO: block userspace mapping of io register */
 	if (adev->asic_type >= CHIP_BONAIRE) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
index 02e6f8c4dde0..ca5f657d8b35 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
@@ -1808,6 +1808,73 @@ static ssize_t amdgpu_get_unique_id(struct device *dev,
 	return 0;
 }
 
+/**
+ * DOC: thermal_throttling_logging
+ *
+ * Thermal throttling pulls down the clock frequency and thus the performance.
+ * It's a useful mechanism to protect the chip from overheating. Since it
+ * impacts performance, the user controls whether logging of throttling
+ * events is enabled and, if so, the log frequency.
+ *
+ * Reading back the file shows you the status (enabled or disabled) and
+ * the interval (in seconds) between each thermal logging.
+ *
+ * Writing an integer to the file sets a new logging interval, in seconds.
+ * The value should be between 1 and 3600. If the value is less than 1,
+ * thermal logging is disabled. Values above 3600 are rejected with -EINVAL.
+ */
+static ssize_t amdgpu_get_thermal_throttling_logging(struct device *dev,
+						     struct device_attribute *attr,
+						     char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = ddev->dev_private;
+
+	return snprintf(buf, PAGE_SIZE, "%s: thermal throttling logging %s, with interval %d seconds\n",
+			adev->ddev->unique,
+			atomic_read(&adev->throttling_logging_enabled) ? "enabled" : "disabled",
+			adev->throttling_logging_rs.interval / HZ + 1);
+}
+
+static ssize_t amdgpu_set_thermal_throttling_logging(struct device *dev,
+						     struct device_attribute *attr,
+						     const char *buf,
+						     size_t count)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = ddev->dev_private;
+	long throttling_logging_interval;
+	unsigned long flags;
+	int ret = 0;
+
+	ret = kstrtol(buf, 0, &throttling_logging_interval);
+	if (ret)
+		return ret;
+
+	if (throttling_logging_interval > 3600)
+		return -EINVAL;
+
+	if (throttling_logging_interval > 0) {
+		raw_spin_lock_irqsave(&adev->throttling_logging_rs.lock, flags);
+		/*
+		 * Store the requested interval minus one second and reset the
+		 * ratelimit internals. This can effectively restart the timer.
+		 */
+		adev->throttling_logging_rs.interval =
+			(throttling_logging_interval - 1) * HZ;
+		adev->throttling_logging_rs.begin = 0;
+		adev->throttling_logging_rs.printed = 0;
+		adev->throttling_logging_rs.missed = 0;
+		raw_spin_unlock_irqrestore(&adev->throttling_logging_rs.lock, flags);
+
+		atomic_set(&adev->throttling_logging_enabled, 1);
+	} else {
+		atomic_set(&adev->throttling_logging_enabled, 0);
+	}
+
+	return count;
+}
+
 static struct amdgpu_device_attr amdgpu_device_attrs[] = {
 	AMDGPU_DEVICE_ATTR_RW(power_dpm_state,				ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
 	AMDGPU_DEVICE_ATTR_RW(power_dpm_force_performance_level,	ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
@@ -1830,6 +1897,7 @@ static struct amdgpu_device_attr amdgpu_device_attrs[] = {
 	AMDGPU_DEVICE_ATTR_RO(pcie_bw,					ATTR_FLAG_BASIC),
 	AMDGPU_DEVICE_ATTR_RW(pp_features,				ATTR_FLAG_BASIC),
 	AMDGPU_DEVICE_ATTR_RO(unique_id,				ATTR_FLAG_BASIC),
+	AMDGPU_DEVICE_ATTR_RW(thermal_throttling_logging,		ATTR_FLAG_BASIC),
 };
 
 static int default_attr_update(struct amdgpu_device *adev, struct amdgpu_device_attr *attr,
diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
index 92cb7924524d..123a2af4be2e 100644
--- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
@@ -1533,11 +1533,6 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
 	 */
 	uint32_t ctxid = entry->src_data[0];
 	uint32_t data;
-	/*
-	 * if the throttling continues, the logging will be performed every
-	 * minute to avoid log flooding.
-	 */
-	static DEFINE_RATELIMIT_STATE(ratelimit_state, 60 * HZ, 1);
 
 	if (client_id == SOC15_IH_CLIENTID_THM) {
 		switch (src_id) {
@@ -1582,7 +1577,10 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
 				smu_v11_0_ack_ac_dc_interrupt(&adev->smu);
 				break;
 			case 0x7:
-				if (__ratelimit(&ratelimit_state))
+				if (!atomic_read(&adev->throttling_logging_enabled))
+					return 0;
+
+				if (__ratelimit(&adev->throttling_logging_rs))
 					smu_log_thermal_throttling(smu);
 
 				break;
-- 
2.29.2