Blob Blame History Raw
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 2 Mar 2018 14:32:45 +0000
Subject: drm/i915/execlists: Split spinlock from its irq disabling side-effect
Git-commit: a3e3883646c05828f96369f7809436f1b3247234
Patch-mainline: v4.17-rc1
References: FATE#326289 FATE#326079 FATE#326049 FATE#322398 FATE#326166

During reset/wedging, we have to clean up the requests on the timeline
and flush the pending interrupt state. Currently, we are abusing the irq
disabling of the timeline spinlock to protect the irq state in
conjunction to the engine's timeline requests, but this is accidental
and conflates the spinlock with the irq state. A baffling state of
affairs for the reader.

Instead, explicitly disable irqs over the critical section, and separate
modifying the irq state from the timeline's requests.

Suggested-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180302143246.2579-4-chris@chris-wilson.co.uk
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Acked-by: Petr Tesarik <ptesarik@suse.com>
---
 drivers/gpu/drm/i915/intel_lrc.c |   35 +++++++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -689,11 +689,27 @@ static void execlists_cancel_requests(st
 
 	GEM_TRACE("%s\n", engine->name);
 
-	spin_lock_irqsave(&engine->timeline->lock, flags);
+	/*
+	 * Before we call engine->cancel_requests(), we should have exclusive
+	 * access to the submission state. This is arranged for us by the
+	 * caller disabling the interrupt generation, the tasklet and other
+	 * threads that may then access the same state, giving us a free hand
+	 * to reset state. However, we still need to let lockdep be aware that
+	 * we know this state may be accessed in hardirq context, so we
+	 * disable the irq around this manipulation and we want to keep
+	 * the spinlock focused on its duties and not accidentally conflate
+	 * coverage to the submission's irq state. (Similarly, although we
+	 * shouldn't need to disable irq around the manipulation of the
+	 * submission's irq state, we also wish to remind ourselves that
+	 * it is irq state.)
+	 */
+	local_irq_save(flags);
 
 	/* Cancel the requests on the HW and clear the ELSP tracker. */
 	execlists_cancel_port_requests(execlists);
 
+	spin_lock(&engine->timeline->lock);
+
 	/* Mark all executing requests as skipped. */
 	list_for_each_entry(rq, &engine->timeline->requests, link) {
 		GEM_BUG_ON(!rq->global_seqno);
@@ -727,6 +743,8 @@ static void execlists_cancel_requests(st
 	execlists->first = NULL;
 	GEM_BUG_ON(port_isset(execlists->port));
 
+	spin_unlock(&engine->timeline->lock);
+
 	/*
 	 * The port is checked prior to scheduling a tasklet, but
 	 * just in case we have suspended the tasklet to do the
@@ -738,7 +756,7 @@ static void execlists_cancel_requests(st
 	/* Mark all CS interrupts as complete */
 	execlists->active = 0;
 
-	spin_unlock_irqrestore(&engine->timeline->lock, flags);
+	local_irq_restore(flags);
 }
 
 /*
@@ -1618,7 +1636,8 @@ static void reset_common_ring(struct int
 	GEM_TRACE("%s seqno=%x\n",
 		  engine->name, request ? request->global_seqno : 0);
 
-	spin_lock_irqsave(&engine->timeline->lock, flags);
+	/* See execlists_cancel_requests() for the irq/spinlock split. */
+	local_irq_save(flags);
 
 	reset_irq(engine);
 
@@ -1634,14 +1653,17 @@ static void reset_common_ring(struct int
 	execlists_cancel_port_requests(execlists);
 
 	/* Push back any incomplete requests for replay after the reset. */
+	spin_lock(&engine->timeline->lock);
 	__unwind_incomplete_requests(engine);
+	spin_unlock(&engine->timeline->lock);
 
 	/* Mark all CS interrupts as complete */
 	execlists->active = 0;
 
-	spin_unlock_irqrestore(&engine->timeline->lock, flags);
+	local_irq_restore(flags);
 
-	/* If the request was innocent, we leave the request in the ELSP
+	/*
+	 * If the request was innocent, we leave the request in the ELSP
 	 * and will try to replay it on restarting. The context image may
 	 * have been corrupted by the reset, in which case we may have
 	 * to service a new GPU hang, but more likely we can continue on
@@ -1654,7 +1676,8 @@ static void reset_common_ring(struct int
 	if (!request || request->fence.error != -EIO)
 		return;
 
-	/* We want a simple context + ring to execute the breadcrumb update.
+	/*
+	 * We want a simple context + ring to execute the breadcrumb update.
 	 * We cannot rely on the context being intact across the GPU hang,
 	 * so clear it and rebuild just what we need for the breadcrumb.
 	 * All pending requests for this context will be zapped, and any