Blob Blame History Raw
From cfbfa05e6d90238f033fa18e63b16a3ea1bf2ff8 Mon Sep 17 00:00:00 2001
From: Rob Clark <robdclark@chromium.org>
Date: Thu, 9 Dec 2021 11:31:13 -0800
Subject: drm/msm/a6xx: Skip crashdumper state if GPU needs_hw_init
Git-commit: 08c4aa3ee26445ae0d027c1e1944e1202687b024
Patch-mainline: v5.17-rc1
References: jsc#PED-1166 jsc#PED-1168 jsc#PED-1170 jsc#PED-1218 jsc#PED-1220 jsc#PED-1222 jsc#PED-1223 jsc#PED-1225

I am seeing some crash logs which imply that we are trying to use
crashdumper hw to read back GPU state when the GPU isn't initialized.
This doesn't go well (for example, GPU could be in 32b address mode
and ignoring the upper bits of buffer that it is trying to dump state
to).

I'm not *quite* sure how we get into this state in the first place,
but lets not make a bad situation worse by triggering iova fault
crashes.

While we're at it, also add the information about whether the GPU is
initialized to the devcore dump to make this easier to see in the
logs (which makes the WARN_ON() redundant and even harmful because
it fills up the small bit of dmesg we get with the crash report).

Signed-off-by: Rob Clark <robdclark@chromium.org>
Link: https://lore.kernel.org/r/20211209193118.1163248-1-robdclark@gmail.com
Signed-off-by: Rob Clark <robdclark@chromium.org>
Acked-by: Patrik Jakobsson <pjakobsson@suse.de>
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 9 ++++++++-
 drivers/gpu/drm/msm/adreno/adreno_gpu.c     | 1 -
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c
index bdd0059a81ff..55f443328d8e 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c
@@ -49,6 +49,8 @@ struct a6xx_gpu_state {
 	s32 hfi_queue_history[2][HFI_HISTORY_SZ];
 
 	struct list_head objs;
+
+	bool gpu_initialized;
 };
 
 static inline int CRASHDUMP_WRITE(u64 *in, u32 reg, u32 val)
@@ -1001,7 +1003,8 @@ struct msm_gpu_state *a6xx_gpu_state_get(struct msm_gpu *gpu)
 	 * write out GPU state, so we need to skip this when the SMMU is
 	 * stalled in response to an iova fault
 	 */
-	if (!stalled && !a6xx_crashdumper_init(gpu, &_dumper)) {
+	if (!stalled && !gpu->needs_hw_init &&
+	    !a6xx_crashdumper_init(gpu, &_dumper)) {
 		dumper = &_dumper;
 	}
 
@@ -1018,6 +1021,8 @@ struct msm_gpu_state *a6xx_gpu_state_get(struct msm_gpu *gpu)
 	if (snapshot_debugbus)
 		a6xx_get_debugbus(gpu, a6xx_state);
 
+	a6xx_state->gpu_initialized = !gpu->needs_hw_init;
+
 	return  &a6xx_state->base;
 }
 
@@ -1246,6 +1251,8 @@ void a6xx_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
 	if (IS_ERR_OR_NULL(state))
 		return;
 
+	drm_printf(p, "gpu-initialized: %d\n", a6xx_state->gpu_initialized);
+
 	adreno_show(gpu, state, p);
 
 	drm_puts(p, "gmu-log:\n");
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
index 47cb40bdbd43..f33cfa4ef1c8 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
@@ -504,7 +504,6 @@ int adreno_gpu_state_get(struct msm_gpu *gpu, struct msm_gpu_state *state)
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
 	int i, count = 0;
 
-	WARN_ON(gpu->needs_hw_init);
 	WARN_ON(!mutex_is_locked(&gpu->lock));
 
 	kref_init(&state->ref);
-- 
2.38.1