Blob Blame History Raw
From 221ab9719bf33ad2984928d2afb20988d652a289 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Sat, 16 Sep 2017 21:44:14 +0100
Subject: [PATCH] drm/i915/execlists: Unwind incomplete requests on resets
Mime-version: 1.0
Content-type: text/plain; charset=UTF-8
Content-transfer-encoding: 8bit
Git-commit: 221ab9719bf33ad2984928d2afb20988d652a289
Patch-mainline: v4.15-rc1
References: FATE#322643 bsc#1055900

Given the mechanism to unwind and replay requests (designed to support
preemption), we have an alternative to the current method of
resubmitting the ELSP upon reset. Resubmitting the ELSP turns out to be
more complicated than expected: we have to handle lost context-switch
interrupts and so must guess which ELSP to resubmit later. Instead,
by unwinding the requests and clearing the ELSP tracking entirely, we
can then just dequeue the first pair of ready requests after resetting,
using the normal submission procedure.

Currently, the unwound requests have maximum priority and so are
guaranteed to be resubmitted upon resume. If we are lucky, we may be
able to coalesce a new request on top!

Suggested-by: Michał Winiarski <michal.winiarski@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20170916204414.32762-4-chris@chris-wilson.co.uk
Reviewed-by: Michał Winiarski <michal.winiarski@intel.com>
Acked-by: Takashi Iwai <tiwai@suse.de>

---
 drivers/gpu/drm/i915/intel_lrc.c |   61 ++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 35 deletions(-)

--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1308,9 +1308,6 @@ static u8 gtiir[] = {
 static int gen8_init_common_ring(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
-	struct execlist_port *port = engine->execlist_port;
-	unsigned int n;
-	bool submit;
 	int ret;
 
 	ret = intel_mocs_init_engine(engine);
@@ -1346,26 +1343,8 @@ static int gen8_init_common_ring(struct
 	engine->csb_head = -1;
 
 	/* After a GPU reset, we may have requests to replay */
-	submit = false;
-	for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
-		if (!port_isset(&port[n]))
-			break;
-
-		DRM_DEBUG_DRIVER("Restarting %s:%d from 0x%x\n",
-				 engine->name, n,
-				 port_request(&port[n])->global_seqno);
-
-		/* Discard the current inflight count */
-		port_set(&port[n], port_request(&port[n]));
-		submit = true;
-	}
-
-	if (!i915.enable_guc_submission) {
-		if (submit)
-			execlists_submit_ports(engine);
-		else if (engine->execlist_first)
-			tasklet_schedule(&engine->irq_tasklet);
-	}
+	if (!i915.enable_guc_submission && engine->execlist_first)
+		tasklet_schedule(&engine->irq_tasklet);
 
 	return 0;
 }
@@ -1407,9 +1386,13 @@ static void reset_common_ring(struct int
 			      struct drm_i915_gem_request *request)
 {
 	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *rq, *rn;
 	struct intel_context *ce;
+	unsigned long flags;
 	unsigned int n;
 
+	spin_lock_irqsave(&engine->timeline->lock, flags);
+
 	/*
 	 * Catch up with any missed context-switch interrupts.
 	 *
@@ -1419,20 +1402,28 @@ static void reset_common_ring(struct int
 	 * guessing the missed context-switch events by looking at what
 	 * requests were completed.
 	 */
-	if (!request) {
-		for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
-			i915_gem_request_put(port_request(&port[n]));
-		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
-		return;
-	}
+	for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
+		i915_gem_request_put(port_request(&port[n]));
+	memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
+
+	/* Push back any incomplete requests for replay after the reset. */
+	list_for_each_entry_safe_reverse(rq, rn,
+					 &engine->timeline->requests, link) {
+		struct i915_priolist *p;
+
+		if (i915_gem_request_completed(rq))
+			break;
+
+		__i915_gem_request_unsubmit(rq);
 
-	if (request->ctx != port_request(port)->ctx) {
-		i915_gem_request_put(port_request(port));
-		port[0] = port[1];
-		memset(&port[1], 0, sizeof(port[1]));
+		p = lookup_priolist(engine,
+				    &rq->priotree,
+				    rq->priotree.priority);
+		list_add(&rq->priotree.link,
+			 &ptr_mask_bits(p, 1)->requests);
 	}
 
-	GEM_BUG_ON(request->ctx != port_request(port)->ctx);
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 
 	/* If the request was innocent, we leave the request in the ELSP
 	 * and will try to replay it on restarting. The context image may
@@ -1444,7 +1435,7 @@ static void reset_common_ring(struct int
 	 * and have to at least restore the RING register in the context
 	 * image back to the expected values to skip over the guilty request.
 	 */
-	if (request->fence.error != -EIO)
+	if (!request || request->fence.error != -EIO)
 		return;
 
 	/* We want a simple context + ring to execute the breadcrumb update.