Blob Blame History Raw
From 671cae75ad4ab4d5eca78e68714e20420c8e1524 Mon Sep 17 00:00:00 2001
From: John Clements <john.clements@amd.com>
Date: Wed, 15 Sep 2021 17:14:12 +0800
Subject: drm/amdgpu: Resolve nBIF RAS error harvesting bug
Git-commit: 226f4f5a6b6cb417ab2ed8550ea47c10190426a1
Patch-mainline: v5.16-rc1
References: jsc#PED-1166 jsc#PED-1168 jsc#PED-1170 jsc#PED-1218 jsc#PED-1220 jsc#PED-1222 jsc#PED-1223 jsc#PED-1225

Set correct RAS nBIF error query register offsets on aldebaran

Signed-off-by: John Clements <john.clements@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Patrik Jakobsson <pjakobsson@suse.de>
---
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 30 ++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index f50045cebd44..0a9fc19b1be0 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -566,7 +566,9 @@ static int nbio_v7_4_init_ras_err_event_athub_interrupt (struct amdgpu_device *a
 	return r;
 }
 
-#define smnPARITY_ERROR_STATUS_UNCORR_GRP2	0x13a20030
+#define smnPARITY_ERROR_STATUS_UNCORR_GRP2	    0x13a20030
+#define smnPARITY_ERROR_STATUS_UNCORR_GRP2_ALDE	0x13b20030
+#define smnRAS_GLOBAL_STATUS_LO_ALDE            0x13b20020
 
 static void nbio_v7_4_query_ras_error_count(struct amdgpu_device *adev,
 					void *ras_error_status)
@@ -575,12 +577,20 @@ static void nbio_v7_4_query_ras_error_count(struct amdgpu_device *adev,
 	uint32_t corr, fatal, non_fatal;
 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
 
-	global_sts = RREG32_PCIE(smnRAS_GLOBAL_STATUS_LO);
+	if (adev->asic_type == CHIP_ALDEBARAN)
+		global_sts = RREG32_PCIE(smnRAS_GLOBAL_STATUS_LO_ALDE);
+	else
+		global_sts = RREG32_PCIE(smnRAS_GLOBAL_STATUS_LO);
+
 	corr = REG_GET_FIELD(global_sts, RAS_GLOBAL_STATUS_LO, ParityErrCorr);
 	fatal = REG_GET_FIELD(global_sts, RAS_GLOBAL_STATUS_LO, ParityErrFatal);
 	non_fatal = REG_GET_FIELD(global_sts, RAS_GLOBAL_STATUS_LO,
 				ParityErrNonFatal);
-	parity_sts = RREG32_PCIE(smnPARITY_ERROR_STATUS_UNCORR_GRP2);
+
+	if (adev->asic_type == CHIP_ALDEBARAN)
+		parity_sts = RREG32_PCIE(smnPARITY_ERROR_STATUS_UNCORR_GRP2_ALDE);
+	else
+		parity_sts = RREG32_PCIE(smnPARITY_ERROR_STATUS_UNCORR_GRP2);
 
 	if (corr)
 		err_data->ce_count++;
@@ -589,13 +599,21 @@ static void nbio_v7_4_query_ras_error_count(struct amdgpu_device *adev,
 
 	if (corr || fatal || non_fatal) {
 		central_sts = RREG32_PCIE(smnBIFL_RAS_CENTRAL_STATUS);
+
 		/* clear error status register */
-		WREG32_PCIE(smnRAS_GLOBAL_STATUS_LO, global_sts);
+		if (adev->asic_type == CHIP_ALDEBARAN)
+			WREG32_PCIE(smnRAS_GLOBAL_STATUS_LO_ALDE, global_sts);
+		else
+			WREG32_PCIE(smnRAS_GLOBAL_STATUS_LO, global_sts);
 
 		if (fatal)
+		{
 			/* clear parity fatal error indication field */
-			WREG32_PCIE(smnPARITY_ERROR_STATUS_UNCORR_GRP2,
-				    parity_sts);
+			if (adev->asic_type == CHIP_ALDEBARAN)
+				WREG32_PCIE(smnPARITY_ERROR_STATUS_UNCORR_GRP2_ALDE, parity_sts);
+			else
+				WREG32_PCIE(smnPARITY_ERROR_STATUS_UNCORR_GRP2, parity_sts);
+		}
 
 		if (REG_GET_FIELD(central_sts, BIFL_RAS_CENTRAL_STATUS,
 				BIFL_RasContller_Intr_Recv)) {
-- 
2.38.1