From cbe913982690a87e235a8633993aebffe07150ac Mon Sep 17 00:00:00 2001
From: Surbhi Kakarya <surbhi.kakarya@amd.com>
Date: Wed, 26 Jan 2022 12:04:39 -0500
Subject: drm/amdgpu: Handle the GPU recovery failure in SRIOV environment.
Git-commit: 7258fa31eabd882f6c8ed4d6d281f6657a33ef94
Patch-mainline: v5.18-rc1
References: jsc#PED-1166 jsc#PED-1168 jsc#PED-1170 jsc#PED-1218 jsc#PED-1220 jsc#PED-1222 jsc#PED-1223 jsc#PED-1225
This patch handles the GPU recovery failure in sriov environment by
retrying the reset if the first reset fails. To determine the condition
of retry, a new macro AMDGPU_RETRY_SRIOV_RESET is added which returns
true if failure is due to ETIMEDOUT, EINVAL or EBUSY, otherwise return
false.A new macro AMDGPU_MAX_RETRY_LIMIT is used to limit the retry to 2.
It also handles the return status in Post Asic Reset by updating the return
code with asic_reset_res and eventually return the return code in
amdgpu_job_timedout().
Signed-off-by: Surbhi Kakarya <surbhi.kakarya@amd.com>
Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Patrik Jakobsson <pjakobsson@suse.de>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 15 +++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 6 +++++-
2 files changed, 20 insertions(+), 1 deletion(-)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -88,6 +88,8 @@
MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");
#define AMDGPU_RESUME_MS 2000
+#define AMDGPU_MAX_RETRY_LIMIT 2
+#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
const char *amdgpu_asic_name[] = {
"TAHITI",
@@ -4366,7 +4368,9 @@
{
int r;
struct amdgpu_hive_info *hive = NULL;
+ int retry_limit = 0;
+retry:
amdgpu_amdkfd_pre_reset(adev);
if (from_hypervisor)
@@ -4413,6 +4417,14 @@
}
amdgpu_virt_release_full_gpu(adev, true);
+ if (AMDGPU_RETRY_SRIOV_RESET(r)) {
+ if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
+ retry_limit++;
+ goto retry;
+ } else
+ DRM_ERROR("GPU reset retry is beyond the retry limit\n");
+ }
+
return r;
}
@@ -5204,6 +5216,9 @@
drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
}
+ if (tmp_adev->asic_reset_res)
+ r = tmp_adev->asic_reset_res;
+
tmp_adev->asic_reset_res = 0;
if (r) {
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -37,6 +37,7 @@
struct amdgpu_task_info ti;
struct amdgpu_device *adev = ring->adev;
int idx;
+ int r;
if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
@@ -63,7 +64,10 @@
ti.process_name, ti.tgid, ti.task_name, ti.pid);
if (amdgpu_device_should_recover_gpu(ring->adev)) {
- amdgpu_device_gpu_recover(ring->adev, job);
+ r = amdgpu_device_gpu_recover(ring->adev, job);
+ if (r)
+ DRM_ERROR("GPU Recovery Failed: %d\n", r);
+
} else {
drm_sched_suspend_timeout(&ring->sched);
if (amdgpu_sriov_vf(adev))