Blob Blame History Raw
From dec7b17b1ff38ee0fd1a1016384fbaa0b678b4dc Mon Sep 17 00:00:00 2001
From: Tao Zhou <tao.zhou1@amd.com>
Date: Mon, 9 May 2022 17:52:15 +0800
Subject: drm/amdgpu: refine RAS poison consumption handler
Git-commit: b63ac5d3033976301f296d048c54d584dfb3ac30
Patch-mainline: v5.19-rc1
References: jsc#PED-1166 jsc#PED-1168 jsc#PED-1170 jsc#PED-1218 jsc#PED-1220 jsc#PED-1222 jsc#PED-1223 jsc#PED-1225

Query RAS status before RAS poison consumption handling, add more
comments and logs.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-and-tested-by: Mohammad Zafar Ziya <Mohammadzafar.ziya@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Patrik Jakobsson <pjakobsson@suse.de>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 +++++++++++++++----------
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index defc6a53c7dc..035891ec59d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1538,33 +1538,42 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
 static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
 				struct amdgpu_iv_entry *entry)
 {
-	bool poison_stat = true, need_reset = true;
+	bool poison_stat = false;
 	struct amdgpu_device *adev = obj->adev;
 	struct ras_err_data err_data = {0, 0, 0, NULL};
 	struct amdgpu_ras_block_object *block_obj =
 		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
 
-	if (!adev->gmc.xgmi.connected_to_cpu)
-		amdgpu_umc_poison_handler(adev, &err_data, false);
-
-	/* both query_poison_status and handle_poison_consumption are optional */
-	if (block_obj && block_obj->hw_ops) {
-		if (block_obj->hw_ops->query_poison_status) {
-			poison_stat = block_obj->hw_ops->query_poison_status(adev);
-			if (!poison_stat)
-				dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
-						block_obj->ras_comm.name);
-		}
+	if (!block_obj || !block_obj->hw_ops)
+		return;
 
-		if (poison_stat && block_obj->hw_ops->handle_poison_consumption) {
-			poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
-			need_reset = poison_stat;
+	/* both query_poison_status and handle_poison_consumption are optional,
+	 * but at least one of them should be implemented if we need poison
+	 * consumption handler
+	 */
+	if (block_obj->hw_ops->query_poison_status) {
+		poison_stat = block_obj->hw_ops->query_poison_status(adev);
+		if (!poison_stat) {
+			/* Not poison consumption interrupt, no need to handle it */
+			dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
+					block_obj->ras_comm.name);
+
+			return;
 		}
 	}
 
-	/* gpu reset is fallback for all failed cases */
-	if (need_reset)
+	if (!adev->gmc.xgmi.connected_to_cpu)
+		amdgpu_umc_poison_handler(adev, &err_data, false);
+
+	if (block_obj->hw_ops->handle_poison_consumption)
+		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
+
+	/* gpu reset is fallback for failed and default cases */
+	if (poison_stat) {
+		dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
+				block_obj->ras_comm.name);
 		amdgpu_ras_reset_gpu(adev);
+	}
 }
 
 static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
-- 
2.38.1