Blob Blame History Raw
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Sat, 2 Jun 2018 11:48:53 +0100
Subject: drm/i915: Declare the driver wedged if hangcheck makes no progress
Git-commit: 1fd00c0faeec0a89dfe97de842facbecc8a04efb
Patch-mainline: v4.19-rc1
References: FATE#326289 FATE#326079 FATE#326049 FATE#322398 FATE#326166

Hangcheck is our back up in case the GPU or the driver gets stuck. It
detects when the GPU is not making any progress and issues a GPU reset.
However, if the driver is failing to make any progress, we can get
ourselves into a situation where we continually try resetting the GPU to
no avail. Employ a second timeout such that if we continue to see the
same seqno (the stalled engine has made no progress at all) over the
course of several hangchecks, declare the driver wedged and attempt to
start afresh.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180602104853.17140-1-chris@chris-wilson.co.uk
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
Acked-by: Petr Tesarik <ptesarik@suse.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c     |    5 +++--
 drivers/gpu/drm/i915/i915_drv.h         |    2 ++
 drivers/gpu/drm/i915/intel_hangcheck.c  |   17 ++++++++++++++++-
 drivers/gpu/drm/i915/intel_ringbuffer.h |    3 ++-
 4 files changed, 23 insertions(+), 4 deletions(-)

--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1358,11 +1358,12 @@ static int i915_hangcheck_info(struct se
 		seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
 			   engine->hangcheck.seqno, seqno[id],
 			   intel_engine_last_submit(engine));
-		seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s\n",
+		seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s, wedged? %s\n",
 			   yesno(intel_engine_has_waiter(engine)),
 			   yesno(test_bit(engine->id,
 					  &dev_priv->gpu_error.missed_irq_rings)),
-			   yesno(engine->hangcheck.stalled));
+			   yesno(engine->hangcheck.stalled),
+			   yesno(engine->hangcheck.wedged));
 
 		spin_lock_irq(&b->rb_lock);
 		for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1013,6 +1013,8 @@ struct i915_gem_mm {
 #define I915_ENGINE_DEAD_TIMEOUT  (4 * HZ)  /* Seqno, head and subunits dead */
 #define I915_SEQNO_DEAD_TIMEOUT   (12 * HZ) /* Seqno dead with active head */
 
+#define I915_ENGINE_WEDGED_TIMEOUT  (60 * HZ)  /* Reset but no recovery? */
+
 enum modeset_restore {
 	MODESET_ON_LID_OPEN,
 	MODESET_DONE,
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -294,6 +294,7 @@ static void hangcheck_store_sample(struc
 	engine->hangcheck.seqno = hc->seqno;
 	engine->hangcheck.action = hc->action;
 	engine->hangcheck.stalled = hc->stalled;
+	engine->hangcheck.wedged = hc->wedged;
 }
 
 static enum intel_engine_hangcheck_action
@@ -368,6 +369,9 @@ static void hangcheck_accumulate_sample(
 
 	hc->stalled = time_after(jiffies,
 				 engine->hangcheck.action_timestamp + timeout);
+	hc->wedged = time_after(jiffies,
+				 engine->hangcheck.action_timestamp +
+				 I915_ENGINE_WEDGED_TIMEOUT);
 }
 
 static void hangcheck_declare_hang(struct drm_i915_private *i915,
@@ -409,7 +413,7 @@ static void i915_hangcheck_elapsed(struc
 			     gpu_error.hangcheck_work.work);
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
-	unsigned int hung = 0, stuck = 0;
+	unsigned int hung = 0, stuck = 0, wedged = 0;
 
 	if (!i915_modparams.enable_hangcheck)
 		return;
@@ -440,6 +444,17 @@ static void i915_hangcheck_elapsed(struc
 			if (hc.action != ENGINE_DEAD)
 				stuck |= intel_engine_flag(engine);
 		}
+
+		if (engine->hangcheck.wedged)
+			wedged |= intel_engine_flag(engine);
+	}
+
+	if (wedged) {
+		dev_err(dev_priv->drm.dev,
+			"GPU recovery timed out,"
+			" cancelling all in-flight rendering.\n");
+		GEM_TRACE_DUMP();
+		i915_gem_set_wedged(dev_priv);
 	}
 
 	if (hung)
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -121,7 +121,8 @@ struct intel_engine_hangcheck {
 	int deadlock;
 	struct intel_instdone instdone;
 	struct i915_request *active_request;
-	bool stalled;
+	bool stalled:1;
+	bool wedged:1;
 };
 
 struct intel_ring {