From aaa6a421898cd138b38f50a46c9474be2634b4f3 Mon Sep 17 00:00:00 2001
From: John Clements <john.clements@amd.com>
Date: Wed, 25 Mar 2020 15:56:31 +0800
Subject: drm/amdgpu: added xgmi ras error reset sequence
Git-commit: 66399248feaf4a2fa4cd76765412a4139aca28e9
Patch-mainline: v5.8-rc1
References: jsc#SLE-12680, jsc#SLE-12880, jsc#SLE-12882, jsc#SLE-12883, jsc#SLE-13496, jsc#SLE-15322
added mechanism to clear xgmi ras status inbetween error queries
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Patrik Jakobsson <pjakobsson@suse.de>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 30 ++++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 1 +
2 files changed, 31 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 95b3327168ac..8c3215505e78 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -604,6 +604,8 @@ int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
adev->gmc.xgmi.num_physical_nodes == 0)
return 0;
+ amdgpu_xgmi_reset_ras_error_count(adev);
+
if (!adev->gmc.xgmi.ras_if) {
adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
if (!adev->gmc.xgmi.ras_if)
@@ -668,6 +670,32 @@ uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
return addr + dram_base_addr;
}
+static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
+{
+ WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
+ WREG32_PCIE(pcs_status_reg, 0);
+}
+
+void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
+{
+ uint32_t i;
+
+ switch (adev->asic_type) {
+ case CHIP_ARCTURUS:
+ for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
+ pcs_clear_status(adev,
+ xgmi_pcs_err_status_reg_arct[i]);
+ break;
+ case CHIP_VEGA20:
+ for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
+ pcs_clear_status(adev,
+ xgmi_pcs_err_status_reg_vg20[i]);
+ break;
+ default:
+ break;
+ }
+}
+
static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
uint32_t value,
uint32_t *ue_count,
@@ -758,6 +786,8 @@ int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
break;
}
+ amdgpu_xgmi_reset_ras_error_count(adev);
+
err_data->ue_count += ue_cnt;
err_data->ce_count += ce_cnt;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 4a92067fe595..d5a63904ec33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -56,6 +56,7 @@ uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
uint64_t addr);
int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status);
+void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev);
static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
struct amdgpu_device *bo_adev)
--
2.28.0