From c802273d12c89f5f0524ef1bda7b7c969ac8d226 Mon Sep 17 00:00:00 2001
From: John Clements <john.clements@amd.com>
Date: Thu, 19 Mar 2020 14:41:55 +0800
Subject: drm/amdgpu: protect RAS sysfs during GPU reset
Git-commit: 43c4d57618bef018eecd769c2805ce6f4e849a0d
Patch-mainline: v5.7-rc1
References: jsc#SLE-12680, jsc#SLE-12880, jsc#SLE-12882, jsc#SLE-12883, jsc#SLE-13496, jsc#SLE-15322
MMHub EDC becomes dirty after BACO reset
EDC registers should be cleared early on in reset phase
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Patrik Jakobsson <pjakobsson@suse.de>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 +++++++++
2 files changed, 20 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6f469facabfb..faa3e7102156 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2742,6 +2742,9 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
if (adev->asic_reset_res)
goto fail;
+
+ if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
+ adev->mmhub.funcs->reset_ras_error_count(adev);
} else {
task_barrier_full(&hive->tb);
@@ -3910,8 +3913,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
}
}
- if (!r && amdgpu_ras_intr_triggered())
+ if (!r && amdgpu_ras_intr_triggered()) {
+ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+ if (tmp_adev->mmhub.funcs &&
+ tmp_adev->mmhub.funcs->reset_ras_error_count)
+ tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
+ }
+
amdgpu_ras_intr_cleared();
+ }
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
if (need_full_reset) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 43055a01f35e..3c32a94d2424 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -281,6 +281,11 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
struct ras_debug_if data;
int ret = 0;
+ if (amdgpu_ras_intr_triggered()) {
+ DRM_WARN("RAS WARN: error injection currently inaccessible\n");
+ return size;
+ }
+
ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
if (ret)
return -EINVAL;
@@ -394,6 +399,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
.head = obj->head,
};
+ if (amdgpu_ras_intr_triggered())
+ return snprintf(buf, PAGE_SIZE,
+ "Query currently inaccessible\n");
+
if (amdgpu_ras_error_query(obj->adev, &info))
return -EINVAL;
--
2.28.0