Blob Blame History Raw
From 2725ccb10f3cc89fd21baadb04a451df4278a7c6 Mon Sep 17 00:00:00 2001
From: John Clements <john.clements@amd.com>
Date: Thu, 26 Aug 2021 13:55:52 +0800
Subject: drm/amdgpu: Add support for RAS XGMI err query
Git-commit: 3c4ff2dcc0dffbfa79f7f55237f502a74ed018b7
Patch-mainline: v5.15-rc1
References: jsc#PED-1166 jsc#PED-1168 jsc#PED-1170 jsc#PED-1218 jsc#PED-1220 jsc#PED-1222 jsc#PED-1223 jsc#PED-1225

Update XGMI RAS to support error query on aldebaran

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Patrik Jakobsson <pjakobsson@suse.de>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 65 ++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index e2a8dfea003c..978ac927ac11 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -32,6 +32,10 @@
 #include "wafl/wafl2_4_0_0_smn.h"
 #include "wafl/wafl2_4_0_0_sh_mask.h"
 
+#define smnPCS_XGMI23_PCS_ERROR_STATUS   0x11a01210
+#define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
+#define smnPCS_GOPX1_PCS_ERROR_STATUS    0x12200210
+
 static DEFINE_MUTEX(xgmi_mutex);
 
 #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE		4
@@ -63,6 +67,33 @@ static const int wafl_pcs_err_status_reg_arct[] = {
 	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
 };
 
+static const int xgmi23_pcs_err_status_reg_aldebaran[] = {
+	smnPCS_XGMI23_PCS_ERROR_STATUS,
+	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x100000,
+	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x200000,
+	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x300000,
+	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x400000,
+	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x500000,
+	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x600000,
+	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x700000
+};
+
+static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = {
+	smnPCS_XGMI3X16_PCS_ERROR_STATUS,
+	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000,
+	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x200000,
+	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x300000,
+	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x400000,
+	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x500000,
+	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x600000,
+	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000
+};
+
+static const int walf_pcs_err_status_reg_aldebaran[] = {
+	smnPCS_GOPX1_PCS_ERROR_STATUS,
+	smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000
+};
+
 static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
 	{"XGMI PCS DataLossErr",
 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
@@ -771,6 +802,17 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
 			pcs_clear_status(adev,
 					 xgmi_pcs_err_status_reg_vg20[i]);
 		break;
+	case CHIP_ALDEBARAN:
+		for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++)
+			pcs_clear_status(adev,
+					 xgmi23_pcs_err_status_reg_aldebaran[i]);
+		for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++)
+			pcs_clear_status(adev,
+					 xgmi23_pcs_err_status_reg_aldebaran[i]);
+		for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++)
+			pcs_clear_status(adev,
+					 walf_pcs_err_status_reg_aldebaran[i]);
+		break;
 	default:
 		break;
 	}
@@ -863,6 +905,29 @@ static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
 						data, &ue_cnt, &ce_cnt, false);
 		}
 		break;
+	case CHIP_ALDEBARAN:
+		/* check xgmi23 pcs error */
+		for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++) {
+			data = RREG32_PCIE(xgmi23_pcs_err_status_reg_aldebaran[i]);
+			if (data)
+				amdgpu_xgmi_query_pcs_error_status(adev,
+						data, &ue_cnt, &ce_cnt, true);
+		}
+		/* check xgmi3x16 pcs error */
+		for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) {
+			data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]);
+			if (data)
+				amdgpu_xgmi_query_pcs_error_status(adev,
+						data, &ue_cnt, &ce_cnt, true);
+		}
+		/* check wafl pcs error */
+		for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) {
+			data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]);
+			if (data)
+				amdgpu_xgmi_query_pcs_error_status(adev,
+						data, &ue_cnt, &ce_cnt, false);
+		}
+		break;
 	default:
 		dev_warn(adev->dev, "XGMI RAS error query not supported");
 		break;
-- 
2.38.1