Blob Blame History Raw
From b3f6cd22c0e6528bfa776cd28c2ba2048f0dbc1b Mon Sep 17 00:00:00 2001
From: Tao Zhou <tao.zhou1@amd.com>
Date: Thu, 12 Oct 2023 14:33:37 +0800
Subject: drm/amdgpu: bypass RAS error reset in some conditions
Git-commit: 73582be11ac8f6d6765e185bf48f22efb9d28c3b
Patch-mainline: v6.7-rc1
References: jsc#PED-3527 jsc#PED-5475 jsc#PED-6068 jsc#PED-6070 jsc#PED-6116 jsc#PED-6120 jsc#PED-5065 jsc#PED-5477 jsc#PED-5511 jsc#PED-6041 jsc#PED-6069 jsc#PED-6071

PMFW is responsible for RAS error reset in some conditions, driver can
skip the operation.

v2: add check for ras->in_recovery, it's set earlier than
amdgpu_in_reset.

v3: fix error in gpu reset check.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Patrik Jakobsson <pjakobsson@suse.de>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index f4c2c737b12f..303fbb6a48b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1220,6 +1220,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
 		enum amdgpu_ras_block block)
 {
 	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
 
 	if (!block_obj || !block_obj->hw_ops) {
 		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
@@ -1227,7 +1229,13 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
 		return -EOPNOTSUPP;
 	}
 
-	if (!amdgpu_ras_is_supported(adev, block))
+	/* skip ras error reset in gpu reset */
+	if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery)) &&
+	    mca_funcs && mca_funcs->mca_set_debug_mode)
+		return -EOPNOTSUPP;
+
+	if (!amdgpu_ras_is_supported(adev, block) ||
+	    !amdgpu_ras_get_mca_debug_mode(adev))
 		return -EOPNOTSUPP;
 
 	if (block_obj->hw_ops->reset_ras_error_count)
-- 
2.43.0