Blob Blame History Raw
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 22 Feb 2018 14:22:29 +0000
Subject: drm/i915/preemption: Allow preemption between submission ports
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Git-commit: f6322eddaff7662e81178a28730e420bf934a512
Patch-mainline: v4.17-rc1
References: FATE#326289 FATE#326079 FATE#326049 FATE#322398 FATE#326166

Sometimes we need to boost the priority of an in-flight request, which
may lead to the situation where the second submission port then contains
a higher priority context than the first and so we need to inject a
preemption event. To do so we must always check inside
execlists_dequeue() whether there is a priority inversion between the
ports themselves as well as the head of the priority sorted queue, and we
cannot just skip dequeuing if the queue is empty.

As Michał noted, this doesn't simply extend to handling more than 2-port
submission, as we may need to reorder within the array of executing
requests which themselves are lower priority than the first. A task for
later!

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180222142229.14517-1-chris@chris-wilson.co.uk
Reviewed-by: Michał Winiarski <michal.winiarski@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Acked-by: Petr Tesarik <ptesarik@suse.com>
---
 drivers/gpu/drm/i915/intel_engine_cs.c      |    2 
 drivers/gpu/drm/i915/intel_guc_submission.c |   17 +-
 drivers/gpu/drm/i915/intel_lrc.c            |  161 +++++++++++++++-------------
 drivers/gpu/drm/i915/intel_ringbuffer.h     |   10 +
 4 files changed, 112 insertions(+), 78 deletions(-)

--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -423,6 +423,7 @@ static void intel_engine_init_execlist(s
 	BUILD_BUG_ON_NOT_POWER_OF_2(execlists_num_ports(execlists));
 	GEM_BUG_ON(execlists_num_ports(execlists) > EXECLIST_MAX_PORTS);
 
+	execlists->queue_priority = INT_MIN;
 	execlists->queue = RB_ROOT;
 	execlists->first = NULL;
 }
@@ -1903,6 +1904,7 @@ void intel_engine_dump(struct intel_engi
 	spin_lock_irq(&engine->timeline->lock);
 	list_for_each_entry(rq, &engine->timeline->requests, link)
 		print_request(m, rq, "\t\tE ");
+	drm_printf(m, "\t\tQueue priority: %d\n", execlists->queue_priority);
 	for (rb = execlists->first; rb; rb = rb_next(rb)) {
 		struct i915_priolist *p =
 			rb_entry(rb, typeof(*p), node);
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -75,6 +75,11 @@
  *
  */
 
+static inline struct i915_priolist *to_priolist(struct rb_node *rb)
+{
+	return rb_entry(rb, struct i915_priolist, node);
+}
+
 static inline bool is_high_priority(struct intel_guc_client *client)
 {
 	return (client->priority == GUC_CLIENT_PRIORITY_KMD_HIGH ||
@@ -682,15 +687,12 @@ static void guc_dequeue(struct intel_eng
 	rb = execlists->first;
 	GEM_BUG_ON(rb_first(&execlists->queue) != rb);
 
-	if (!rb)
-		goto unlock;
-
 	if (port_isset(port)) {
 		if (engine->i915->preempt_context) {
 			struct guc_preempt_work *preempt_work =
 				&engine->i915->guc.preempt_work[engine->id];
 
-			if (rb_entry(rb, struct i915_priolist, node)->priority >
+			if (execlists->queue_priority >
 			    max(port_request(port)->priotree.priority, 0)) {
 				execlists_set_active(execlists,
 						     EXECLISTS_ACTIVE_PREEMPT);
@@ -706,8 +708,8 @@ static void guc_dequeue(struct intel_eng
 	}
 	GEM_BUG_ON(port_isset(port));
 
-	do {
-		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
+	while (rb) {
+		struct i915_priolist *p = to_priolist(rb);
 		struct i915_request *rq, *rn;
 
 		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
@@ -736,8 +738,9 @@ static void guc_dequeue(struct intel_eng
 		INIT_LIST_HEAD(&p->requests);
 		if (p->priority != I915_PRIORITY_NORMAL)
 			kmem_cache_free(engine->i915->priorities, p);
-	} while (rb);
+	}
 done:
+	execlists->queue_priority = rb ? to_priolist(rb)->priority : INT_MIN;
 	execlists->first = rb;
 	if (submit) {
 		port_assign(port, last);
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -169,6 +169,23 @@ static void execlists_init_reg_state(u32
 				     struct intel_engine_cs *engine,
 				     struct intel_ring *ring);
 
+static inline struct i915_priolist *to_priolist(struct rb_node *rb)
+{
+	return rb_entry(rb, struct i915_priolist, node);
+}
+
+static inline int rq_prio(const struct i915_request *rq)
+{
+	return rq->priotree.priority;
+}
+
+static inline bool need_preempt(const struct intel_engine_cs *engine,
+				const struct i915_request *last,
+				int prio)
+{
+	return engine->i915->preempt_context && prio > max(rq_prio(last), 0);
+}
+
 /**
  * intel_lr_context_descriptor_update() - calculate & cache the descriptor
  * 					  descriptor for a pinned context
@@ -224,7 +241,7 @@ find_priolist:
 	parent = &execlists->queue.rb_node;
 	while (*parent) {
 		rb = *parent;
-		p = rb_entry(rb, typeof(*p), node);
+		p = to_priolist(rb);
 		if (prio > p->priority) {
 			parent = &rb->rb_left;
 		} else if (prio < p->priority) {
@@ -264,7 +281,7 @@ find_priolist:
 	if (first)
 		execlists->first = &p->node;
 
-	return ptr_pack_bits(p, first, 1);
+	return p;
 }
 
 static void unwind_wa_tail(struct i915_request *rq)
@@ -290,14 +307,10 @@ static void __unwind_incomplete_requests
 		__i915_request_unsubmit(rq);
 		unwind_wa_tail(rq);
 
-		GEM_BUG_ON(rq->priotree.priority == I915_PRIORITY_INVALID);
-		if (rq->priotree.priority != last_prio) {
-			p = lookup_priolist(engine,
-					    &rq->priotree,
-					    rq->priotree.priority);
-			p = ptr_mask_bits(p, 1);
-
-			last_prio = rq->priotree.priority;
+		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
+		if (rq_prio(rq) != last_prio) {
+			last_prio = rq_prio(rq);
+			p = lookup_priolist(engine, &rq->priotree, last_prio);
 		}
 
 		list_add(&rq->priotree.link, &p->requests);
@@ -397,10 +410,11 @@ static void execlists_submit_ports(struc
 			desc = execlists_update_context(rq);
 			GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
 
-			GEM_TRACE("%s in[%d]:  ctx=%d.%d, seqno=%x\n",
+			GEM_TRACE("%s in[%d]:  ctx=%d.%d, seqno=%x, prio=%d\n",
 				  engine->name, n,
 				  port[n].context_id, count,
-				  rq->global_seqno);
+				  rq->global_seqno,
+				  rq_prio(rq));
 		} else {
 			GEM_BUG_ON(!n);
 			desc = 0;
@@ -453,12 +467,17 @@ static void inject_preempt_context(struc
 		   _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
 				      CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT));
 
+	/*
+	 * Switch to our empty preempt context so
+	 * the state of the GPU is known (idle).
+	 */
 	GEM_TRACE("%s\n", engine->name);
 	for (n = execlists_num_ports(&engine->execlists); --n; )
 		elsp_write(0, engine->execlists.elsp);
 
 	elsp_write(ce->lrc_desc, engine->execlists.elsp);
 	execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
+	execlists_set_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT);
 }
 
 static void execlists_dequeue(struct intel_engine_cs *engine)
@@ -495,8 +514,6 @@ static void execlists_dequeue(struct int
 	spin_lock_irq(&engine->timeline->lock);
 	rb = execlists->first;
 	GEM_BUG_ON(rb_first(&execlists->queue) != rb);
-	if (!rb)
-		goto unlock;
 
 	if (last) {
 		/*
@@ -519,54 +536,48 @@ static void execlists_dequeue(struct int
 		if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
 			goto unlock;
 
-		if (engine->i915->preempt_context &&
-		    rb_entry(rb, struct i915_priolist, node)->priority >
-		    max(last->priotree.priority, 0)) {
-			/*
-			 * Switch to our empty preempt context so
-			 * the state of the GPU is known (idle).
-			 */
+		if (need_preempt(engine, last, execlists->queue_priority)) {
 			inject_preempt_context(engine);
-			execlists_set_active(execlists,
-					     EXECLISTS_ACTIVE_PREEMPT);
 			goto unlock;
-		} else {
-			/*
-			 * In theory, we could coalesce more requests onto
-			 * the second port (the first port is active, with
-			 * no preemptions pending). However, that means we
-			 * then have to deal with the possible lite-restore
-			 * of the second port (as we submit the ELSP, there
-			 * may be a context-switch) but also we may complete
-			 * the resubmission before the context-switch. Ergo,
-			 * coalescing onto the second port will cause a
-			 * preemption event, but we cannot predict whether
-			 * that will affect port[0] or port[1].
-			 *
-			 * If the second port is already active, we can wait
-			 * until the next context-switch before contemplating
-			 * new requests. The GPU will be busy and we should be
-			 * able to resubmit the new ELSP before it idles,
-			 * avoiding pipeline bubbles (momentary pauses where
-			 * the driver is unable to keep up the supply of new
-			 * work).
-			 */
-			if (port_count(&port[1]))
-				goto unlock;
-
-			/* WaIdleLiteRestore:bdw,skl
-			 * Apply the wa NOOPs to prevent
-			 * ring:HEAD == rq:TAIL as we resubmit the
-			 * request. See gen8_emit_breadcrumb() for
-			 * where we prepare the padding after the
-			 * end of the request.
-			 */
-			last->tail = last->wa_tail;
 		}
+
+		/*
+		 * In theory, we could coalesce more requests onto
+		 * the second port (the first port is active, with
+		 * no preemptions pending). However, that means we
+		 * then have to deal with the possible lite-restore
+		 * of the second port (as we submit the ELSP, there
+		 * may be a context-switch) but also we may complete
+		 * the resubmission before the context-switch. Ergo,
+		 * coalescing onto the second port will cause a
+		 * preemption event, but we cannot predict whether
+		 * that will affect port[0] or port[1].
+		 *
+		 * If the second port is already active, we can wait
+		 * until the next context-switch before contemplating
+		 * new requests. The GPU will be busy and we should be
+		 * able to resubmit the new ELSP before it idles,
+		 * avoiding pipeline bubbles (momentary pauses where
+		 * the driver is unable to keep up the supply of new
+		 * work). However, we have to double check that the
+		 * priorities of the ports haven't been switch.
+		 */
+		if (port_count(&port[1]))
+			goto unlock;
+
+		/*
+		 * WaIdleLiteRestore:bdw,skl
+		 * Apply the wa NOOPs to prevent
+		 * ring:HEAD == rq:TAIL as we resubmit the
+		 * request. See gen8_emit_breadcrumb() for
+		 * where we prepare the padding after the
+		 * end of the request.
+		 */
+		last->tail = last->wa_tail;
 	}
 
-	do {
-		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
+	while (rb) {
+		struct i915_priolist *p = to_priolist(rb);
 		struct i915_request *rq, *rn;
 
 		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
@@ -628,8 +639,9 @@ static void execlists_dequeue(struct int
 		INIT_LIST_HEAD(&p->requests);
 		if (p->priority != I915_PRIORITY_NORMAL)
 			kmem_cache_free(engine->i915->priorities, p);
-	} while (rb);
+	}
 done:
+	execlists->queue_priority = rb ? to_priolist(rb)->priority : INT_MIN;
 	execlists->first = rb;
 	if (submit)
 		port_assign(port, last);
@@ -692,7 +704,7 @@ static void execlists_cancel_requests(st
 	/* Flush the queued requests to the timeline list (for retiring). */
 	rb = execlists->first;
 	while (rb) {
-		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
+		struct i915_priolist *p = to_priolist(rb);
 
 		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
 			INIT_LIST_HEAD(&rq->priotree.link);
@@ -710,7 +722,7 @@ static void execlists_cancel_requests(st
 
 	/* Remaining _unready_ requests will be nop'ed when submitted */
 
-
+	execlists->queue_priority = INT_MIN;
 	execlists->queue = RB_ROOT;
 	execlists->first = NULL;
 	GEM_BUG_ON(port_isset(execlists->port));
@@ -869,10 +881,11 @@ static void execlists_submission_tasklet
 							EXECLISTS_ACTIVE_USER));
 
 			rq = port_unpack(port, &count);
-			GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x\n",
+			GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x, prio=%d\n",
 				  engine->name,
 				  port->context_id, count,
-				  rq ? rq->global_seqno : 0);
+				  rq ? rq->global_seqno : 0,
+				  rq ? rq_prio(rq) : 0);
 
 			/* Check the context/desc id for this event matches */
 			GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
@@ -917,15 +930,19 @@ static void execlists_submission_tasklet
 		intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
 }
 
-static void insert_request(struct intel_engine_cs *engine,
-			   struct i915_priotree *pt,
-			   int prio)
+static void queue_request(struct intel_engine_cs *engine,
+			  struct i915_priotree *pt,
+			  int prio)
 {
-	struct i915_priolist *p = lookup_priolist(engine, pt, prio);
+	list_add_tail(&pt->link, &lookup_priolist(engine, pt, prio)->requests);
+}
 
-	list_add_tail(&pt->link, &ptr_mask_bits(p, 1)->requests);
-	if (ptr_unmask_bits(p, 1))
+static void submit_queue(struct intel_engine_cs *engine, int prio)
+{
+	if (prio > engine->execlists.queue_priority) {
+		engine->execlists.queue_priority = prio;
 		tasklet_hi_schedule(&engine->execlists.tasklet);
+	}
 }
 
 static void execlists_submit_request(struct i915_request *request)
@@ -936,7 +953,8 @@ static void execlists_submit_request(str
 	/* Will be called from irq-context when using foreign fences. */
 	spin_lock_irqsave(&engine->timeline->lock, flags);
 
-	insert_request(engine, &request->priotree, request->priotree.priority);
+	queue_request(engine, &request->priotree, rq_prio(request));
+	submit_queue(engine, rq_prio(request));
 
 	GEM_BUG_ON(!engine->execlists.first);
 	GEM_BUG_ON(list_empty(&request->priotree.link));
@@ -992,7 +1010,7 @@ static void execlists_schedule(struct i9
 	 * static void update_priorities(struct i915_priotree *pt, prio) {
 	 *	list_for_each_entry(dep, &pt->signalers_list, signal_link)
 	 *		update_priorities(dep->signal, prio)
-	 *	insert_request(pt);
+	 *	queue_request(pt);
 	 * }
 	 * but that may have unlimited recursion depth and so runs a very
 	 * real risk of overunning the kernel stack. Instead, we build
@@ -1055,8 +1073,9 @@ static void execlists_schedule(struct i9
 		pt->priority = prio;
 		if (!list_empty(&pt->link)) {
 			__list_del_entry(&pt->link);
-			insert_request(engine, pt, prio);
+			queue_request(engine, pt, prio);
 		}
+		submit_queue(engine, prio);
 	}
 
 	spin_unlock_irq(&engine->timeline->lock);
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -257,6 +257,16 @@ struct intel_engine_execlists {
 	unsigned int port_mask;
 
 	/**
+	 * @queue_priority: Highest pending priority.
+	 *
+	 * When we add requests into the queue, or adjust the priority of
+	 * executing requests, we compute the maximum priority of those
+	 * pending requests. We can then use this value to determine if
+	 * we need to preempt the executing requests to service the queue.
+	 */
+	int queue_priority;
+
+	/**
 	 * @queue: queue of requests, in priority lists
 	 */
 	struct rb_root queue;