From a3996639d8a7a44086c322332b2d129de83cea05 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 28 May 2020 08:41:00 +0100
Subject: drm/i915/gt: Don't declare hangs if engine is stalled
Git-commit: ba03a63d76ac8131fad58c34fb793d18b0a8964c
Patch-mainline: v5.9-rc1
References: jsc#SLE-12680, jsc#SLE-12880, jsc#SLE-12882, jsc#SLE-12883, jsc#SLE-13496, jsc#SLE-15322
If the ring submission is stalled on an external request, nothing can be
submitted, not even the heartbeat in the kernel context. Since nothing
is running, resetting the engine/device does not unblock the system and
is pointless. Check whether the heartbeat request has actually been
submitted before declaring the engine hung.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200528074109.28235-2-chris@chris-wilson.co.uk
Signed-off-by: Patrik Jakobsson <pjakobsson@suse.de>
---
.../gpu/drm/i915/gt/intel_engine_heartbeat.c | 19 ++++++++++++++++---
1 file changed, 16 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
index 5136c8bf112d..f67ad937eefb 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
@@ -48,8 +48,10 @@ static void show_heartbeat(const struct i915_request *rq,
struct drm_printer p = drm_debug_printer("heartbeat");
intel_engine_dump(engine, &p,
- "%s heartbeat {prio:%d} not ticking\n",
+ "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
engine->name,
+ rq->fence.context,
+ rq->fence.seqno,
rq->sched.attr.priority);
}
@@ -76,8 +78,19 @@ static void heartbeat(struct work_struct *wrk)
goto out;
if (engine->heartbeat.systole) {
- if (engine->schedule &&
- rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
+ if (!i915_sw_fence_signaled(&rq->submit)) {
+ /*
+ * Not yet submitted, system is stalled.
+ *
+ * This more often happens for ring submission,
+ * where all contexts are funnelled into a common
+ * ringbuffer. If one context is blocked on an
+ * external fence, not only is it not submitted,
+ * but all other contexts, including the kernel
+ * context are stuck waiting for the signal.
+ */
+ } else if (engine->schedule &&
+ rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
/*
* Gradually raise the priority of the heartbeat to
* give high priority work [which presumably desires
--
2.29.2