Blob Blame History Raw
From 0c4b1270b0ef00ea64cbd84be7de96c34dbc4bb6 Mon Sep 17 00:00:00 2001
From: shaoyunl <shaoyun.liu@amd.com>
Date: Tue, 6 Sep 2022 23:05:11 -0400
Subject: drm/amdgpu: Use per device reset_domain for XGMI on sriov
 configuration
Git-commit: 46c676600c715f833b066581247cd5a461e03441
Patch-mainline: v6.1-rc1
References: jsc#PED-1166 jsc#PED-1168 jsc#PED-1170 jsc#PED-1218 jsc#PED-1220 jsc#PED-1222 jsc#PED-1223 jsc#PED-1225 jsc#PED-2849

For SRIOV configuration, host driver control the reset method(either FLR or
heavier chain reset). The host will notify the guest individually with FLR
message if individual GPU within the hive need to be reset. So for guest
side, no need to use hive->reset_domain to replace the original per
device reset_domain

Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Patrik Jakobsson <pjakobsson@suse.de>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 ++++++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 37 ++++++++++++++--------
 2 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7508fc050d62..d36a8b40a49d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2461,17 +2461,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 		if (amdgpu_xgmi_add_device(adev) == 0) {
 			struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
-			if (!hive->reset_domain ||
-			    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
-				r = -ENOENT;
+			if (!amdgpu_sriov_vf(adev)) {
+				if (!hive->reset_domain ||
+				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
+					r = -ENOENT;
+					amdgpu_put_xgmi_hive(hive);
+					goto init_failed;
+				}
+
+				/* Drop the early temporary reset domain we created for device */
+				amdgpu_reset_put_reset_domain(adev->reset_domain);
+				adev->reset_domain = hive->reset_domain;
 				amdgpu_put_xgmi_hive(hive);
-				goto init_failed;
 			}
-
-			/* Drop the early temporary reset domain we created for device */
-			amdgpu_reset_put_reset_domain(adev->reset_domain);
-			adev->reset_domain = hive->reset_domain;
-			amdgpu_put_xgmi_hive(hive);
 		}
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index d3b483aa81f8..10477ca52a41 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -391,24 +391,33 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
 		goto pro_end;
 	}
 
+	/**
+	 * Only init hive->reset_domain for none SRIOV configuration. For SRIOV,
+	 * Host driver decide how to reset the GPU either through FLR or chain reset.
+	 * Guest side will get individual notifications from the host for the FLR
+	 * if necessary.
+	 */
+	if (!amdgpu_sriov_vf(adev)) {
 	/**
 	 * Avoid recreating reset domain when hive is reconstructed for the case
-	 * of reset the devices in the XGMI hive during probe for SRIOV
+	 * of reset the devices in the XGMI hive during probe for passthrough GPU
 	 * See https://www.spinics.net/lists/amd-gfx/msg58836.html
 	 */
-	if (adev->reset_domain->type != XGMI_HIVE) {
-		hive->reset_domain = amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
-			if (!hive->reset_domain) {
-				dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n");
-				ret = -ENOMEM;
-				kobject_put(&hive->kobj);
-				kfree(hive);
-				hive = NULL;
-				goto pro_end;
-			}
-	} else {
-		amdgpu_reset_get_reset_domain(adev->reset_domain);
-		hive->reset_domain = adev->reset_domain;
+		if (adev->reset_domain->type != XGMI_HIVE) {
+			hive->reset_domain =
+				amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
+				if (!hive->reset_domain) {
+					dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n");
+					ret = -ENOMEM;
+					kobject_put(&hive->kobj);
+					kfree(hive);
+					hive = NULL;
+					goto pro_end;
+				}
+		} else {
+			amdgpu_reset_get_reset_domain(adev->reset_domain);
+			hive->reset_domain = adev->reset_domain;
+		}
 	}
 
 	hive->hive_id = adev->gmc.xgmi.hive_id;
-- 
2.38.1