Blob Blame History Raw
From 1a6fc071e1991321d3b6a00e0e7c733a462a4418 Mon Sep 17 00:00:00 2001
From: Tao Zhou <tao.zhou1@amd.com>
Date: Fri, 30 Aug 2019 19:50:39 +0800
Subject: drm/amdgpu: move the call of ras recovery_init and bad page reserve
 to proper place
Git-commit: 1a6fc071e1991321d3b6a00e0e7c733a462a4418
Patch-mainline: v5.5-rc1
References: bsc#1152489

ras recovery_init should be called after ttm init,
bad page reserve should be put in front of gpu reset since i2c
may be unstable during gpu reset.
add cleanup for recovery_init and recovery_fini

v2: add more comment and print.
    remove cancel_work_sync in recovery_init.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Thomas Zimmermann <tzimmermann@suse.de>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 39 ++++++++++++++--------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  5 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c    | 12 +++++++
 4 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3ad034aa0e3c..3268291babf8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3630,11 +3630,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 						break;
 				}
 			}
-
-			list_for_each_entry(tmp_adev, device_list_handle,
-					gmc.xgmi.head) {
-				amdgpu_ras_reserve_bad_pages(tmp_adev);
-			}
 		}
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 53540e067d15..e9bd40ea7ce0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1493,16 +1493,17 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
 	return 0;
 }
 
-static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data **data = &con->eh_data;
 	int ret;
 
-	*data = kmalloc(sizeof(**data),
-			GFP_KERNEL|__GFP_ZERO);
-	if (!*data)
-		return -ENOMEM;
+	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
+	if (!*data) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	mutex_init(&con->recovery_lock);
 	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
@@ -1511,18 +1512,30 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 
 	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
 	if (ret)
-		return ret;
+		goto free;
 
 	if (adev->psp.ras.ras->eeprom_control.num_recs) {
 		ret = amdgpu_ras_load_bad_pages(adev);
 		if (ret)
-			return ret;
+			goto free;
 		ret = amdgpu_ras_reserve_bad_pages(adev);
 		if (ret)
-			return ret;
+			goto release;
 	}
 
 	return 0;
+
+release:
+	amdgpu_ras_release_bad_pages(adev);
+free:
+	con->eh_data = NULL;
+	kfree((*data)->bps);
+	kfree((*data)->bps_bo);
+	kfree(*data);
+out:
+	DRM_WARN("Failed to initialize ras recovery!\n");
+
+	return ret;
 }
 
 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
@@ -1530,12 +1543,17 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data = con->eh_data;
 
+	/* recovery_init failed to init it, fini is useless */
+	if (!data)
+		return 0;
+
 	cancel_work_sync(&con->recovery_work);
 	amdgpu_ras_release_bad_pages(adev);
 
 	mutex_lock(&con->recovery_lock);
 	con->eh_data = NULL;
 	kfree(data->bps);
+	kfree(data->bps_bo);
 	kfree(data);
 	mutex_unlock(&con->recovery_lock);
 
@@ -1627,9 +1645,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 			return r;
 	}
 
-	if (amdgpu_ras_recovery_init(adev))
-		goto recovery_out;
-
 	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
 
 	if (amdgpu_ras_fs_init(adev))
@@ -1644,8 +1659,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 			con->hw_supported, con->supported);
 	return 0;
 fs_out:
-	amdgpu_ras_recovery_fini(adev);
-recovery_out:
 	amdgpu_ras_set_context(adev, NULL);
 	kfree(con);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 96210e18191e..012034d2ae06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
 	return ras && (ras->supported & (1 << block));
 }
 
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
 int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
 		unsigned int block);
 
@@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev,
 {
 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
+	/* save bad page to eeprom before gpu reset,
+	 * i2c may be unstable in gpu reset
+	 */
+	amdgpu_ras_reserve_bad_pages(adev);
 	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
 		schedule_work(&ras->recovery_work);
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 13b144c8f67d..54e6dacc34a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -54,6 +54,7 @@
 #include "amdgpu_trace.h"
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_sdma.h"
+#include "amdgpu_ras.h"
 #include "bif/bif_4_1_d.h"
 
 static int amdgpu_map_buffer(struct ttm_buffer_object *bo,
@@ -1777,6 +1778,17 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
 						adev->gmc.visible_vram_size);
 #endif
 
+	/*
+	 * retired pages will be loaded from eeprom and reserved here,
+	 * it should be called after ttm init since new bo may be created,
+	 * recovery_init may fail, but it can free all resources allocated by
+	 * itself and its failure should not stop amdgpu init process.
+	 *
+	 * Note: theoretically, this should be called before all vram allocations
+	 * to protect retired page from abusing
+	 */
+	amdgpu_ras_recovery_init(adev);
+
 	/*
 	 *The reserved vram for firmware must be pinned to the specified
 	 *place on the VRAM, so reserve it early.
-- 
2.28.0