Blob Blame History Raw
From 6acbea89efbdc6aef56bc6f364d4d8b72a5fc145 Mon Sep 17 00:00:00 2001
From: Michel Thierry <michel.thierry@intel.com>
Date: Tue, 31 Oct 2017 15:53:09 -0700
Subject: [PATCH] drm/i915/guc: Add support for reset engine using GuC commands
Git-commit: 6acbea89efbdc6aef56bc6f364d4d8b72a5fc145
Patch-mainline: v4.16-rc1
References: FATE#322643 bsc#1055900

This patch adds per engine reset and recovery (TDR) support when GuC is
used to submit workloads to GPU.

In the case of i915 directly submission to ELSP, driver manages hang
detection, recovery and resubmission. With GuC submission these tasks
are shared between driver and GuC. i915 is still responsible for detecting
a hang, and when it does it only requests GuC to reset that Engine. GuC
internally manages acquiring forcewake and idling the engine before
resetting it.

Once the reset is successful, i915 takes over again and handles the
resubmission. The scheduler in i915 knows which requests are pending so
after resetting a engine, pending workloads/requests are resubmitted
again.

V2: s/i915_guc_request_engine_reset/i915_guc_reset_engine/ to match the
non-guc function names.

V3: Removed debug message about engine restarting from which request,
since the new baseline do it regardless of submission mode. (Chris)

V4: Rebase.

V5: Do not pass unnecessary reporting flags to the fw (Jeff);
tasklet_schedule(&execlists->irq_tasklet) handles the resubmit; rebase.

V6: Rename the existing reset engine function and share a similar
interface between guc and non-guc paths (Chris).

Signed-off-by: Michel Thierry <michel.thierry@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20171031225309.10888-1-michel.thierry@intel.com
Reviewed-by: Jeff McGee <jeff.mcgee@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Acked-by: Takashi Iwai <tiwai@suse.de>

---
 drivers/gpu/drm/i915/i915_drv.c       |   14 ++++++++++++--
 drivers/gpu/drm/i915/i915_drv.h       |    2 ++
 drivers/gpu/drm/i915/intel_guc.c      |   24 ++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_guc_fwif.h |    1 +
 drivers/gpu/drm/i915/intel_uncore.c   |    5 -----
 5 files changed, 39 insertions(+), 7 deletions(-)

--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1951,6 +1951,12 @@ error:
 	goto finish;
 }
 
+static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv,
+					struct intel_engine_cs *engine)
+{
+	return intel_gpu_reset(dev_priv, intel_engine_flag(engine));
+}
+
 /**
  * i915_reset_engine - reset GPU engine to recover from a hang
  * @engine: engine to reset
@@ -1985,10 +1991,14 @@ int i915_reset_engine(struct intel_engin
 		goto out;
 	}
 
-	ret = intel_gpu_reset(engine->i915, intel_engine_flag(engine));
+	if (!engine->i915->guc.execbuf_client)
+		ret = intel_gt_reset_engine(engine->i915, engine);
+	else
+		ret = intel_guc_reset_engine(&engine->i915->guc, engine);
 	if (ret) {
 		/* If we fail here, we expect to fallback to a global reset */
-		DRM_DEBUG_DRIVER("Failed to reset %s, ret=%d\n",
+		DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
+				 engine->i915->guc.execbuf_client ? "GuC " : "",
 				 engine->name, ret);
 		goto out;
 	}
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3328,6 +3328,8 @@ extern int i915_reset_engine(struct inte
 
 extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
 extern int intel_reset_guc(struct drm_i915_private *dev_priv);
+extern int intel_guc_reset_engine(struct intel_guc *guc,
+				  struct intel_engine_cs *engine);
 extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
 extern void intel_hangcheck_init(struct drm_i915_private *dev_priv);
 extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
--- a/drivers/gpu/drm/i915/intel_guc.c
+++ b/drivers/gpu/drm/i915/intel_guc.c
@@ -24,6 +24,7 @@
 
 #include "intel_guc.h"
 #include "i915_drv.h"
+#include "i915_guc_submission.h"
 
 static void gen8_guc_raise_irq(struct intel_guc *guc)
 {
@@ -282,6 +283,29 @@ int intel_guc_suspend(struct drm_i915_pr
 
 	return intel_guc_send(guc, data, ARRAY_SIZE(data));
 }
+
+/**
+ * intel_guc_reset_engine() - ask GuC to reset an engine
+ * @guc:	intel_guc structure
+ * @engine:	engine to be reset
+ */
+int intel_guc_reset_engine(struct intel_guc *guc,
+			   struct intel_engine_cs *engine)
+{
+	u32 data[7];
+
+	GEM_BUG_ON(!guc->execbuf_client);
+
+	data[0] = INTEL_GUC_ACTION_REQUEST_ENGINE_RESET;
+	data[1] = engine->guc_id;
+	data[2] = 0;
+	data[3] = 0;
+	data[4] = 0;
+	data[5] = guc->execbuf_client->stage_id;
+	data[6] = guc_ggtt_offset(guc->shared_data);
+
+	return intel_guc_send(guc, data, ARRAY_SIZE(data));
+}
 
 /**
  * intel_guc_resume() - notify GuC resuming from suspend state
--- a/drivers/gpu/drm/i915/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/intel_guc_fwif.h
@@ -574,6 +574,7 @@ struct guc_shared_ctx_data {
 enum intel_guc_action {
 	INTEL_GUC_ACTION_DEFAULT = 0x0,
 	INTEL_GUC_ACTION_REQUEST_PREEMPTION = 0x2,
+	INTEL_GUC_ACTION_REQUEST_ENGINE_RESET = 0x3,
 	INTEL_GUC_ACTION_SAMPLE_FORCEWAKE = 0x6,
 	INTEL_GUC_ACTION_ALLOCATE_DOORBELL = 0x10,
 	INTEL_GUC_ACTION_DEALLOCATE_DOORBELL = 0x20,
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1805,14 +1805,9 @@ bool intel_has_gpu_reset(struct drm_i915
 	return intel_get_gpu_reset(dev_priv) != NULL;
 }
 
-/*
- * When GuC submission is enabled, GuC manages ELSP and can initiate the
- * engine reset too. For now, fall back to full GPU reset if it is enabled.
- */
 bool intel_has_reset_engine(struct drm_i915_private *dev_priv)
 {
 	return (dev_priv->info.has_reset_engine &&
-		!dev_priv->guc.execbuf_client &&
 		i915_modparams.reset >= 2);
 }