Blob Blame History Raw
From 69caccc6e66929253447b21b422fb1f051b90066 Mon Sep 17 00:00:00 2001
From: Felix Kuehling <Felix.Kuehling@amd.com>
Date: Tue, 7 Dec 2021 22:03:52 -0500
Subject: drm/amdkfd: Fix DQM asserts on Hawaii
Git-commit: 6f4cb84ae0f6d56d0b5071a0057df9c7cfb82ce1
Patch-mainline: v5.17-rc1
References: jsc#PED-1166 jsc#PED-1168 jsc#PED-1170 jsc#PED-1218 jsc#PED-1220 jsc#PED-1222 jsc#PED-1223 jsc#PED-1225

start_nocpsch would never set dqm->sched_running on Hawaii due to an
early return statement. This would trigger asserts in other functions
and end up in inconsistent states.

Bug: https://github.com/RadeonOpenCompute/ROCm/issues/1624
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Kent Russell <kent.russell@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Patrik Jakobsson <pjakobsson@suse.de>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 19890e350107..4b6814949aad 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1004,14 +1004,17 @@ static void uninitialize(struct device_queue_manager *dqm)
 
 static int start_nocpsch(struct device_queue_manager *dqm)
 {
+	int r = 0;
+
 	pr_info("SW scheduler is used");
 	init_interrupts(dqm);
 	
 	if (dqm->dev->adev->asic_type == CHIP_HAWAII)
-		return pm_init(&dqm->packet_mgr, dqm);
-	dqm->sched_running = true;
+		r = pm_init(&dqm->packet_mgr, dqm);
+	if (!r)
+		dqm->sched_running = true;
 
-	return 0;
+	return r;
 }
 
 static int stop_nocpsch(struct device_queue_manager *dqm)
-- 
2.38.1