From 547c7a8b30431215a4a35c1fe277d2c08dd95188 Mon Sep 17 00:00:00 2001
From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Date: Mon, 25 Apr 2022 17:30:45 -0700
Subject: i915/guc/reset: Make __guc_reset_context aware of guilty engines
Git-commit: 89e96d822bd51f7afe2d3e95a34099480b5c3d55
Patch-mainline: v5.18
References: jsc#PED-1166 jsc#PED-1168 jsc#PED-1170 jsc#PED-1218 jsc#PED-1220 jsc#PED-1222 jsc#PED-1223 jsc#PED-1225
There are 2 ways an engine can get reset in i915 and the method of reset
affects how KMD labels a context as guilty/innocent.
(1) GuC initiated engine-reset: GuC resets a hung engine and notifies
KMD. The context that hung on the engine is marked guilty and all other
contexts are innocent. The innocent contexts are resubmitted.
(2) GT based reset: When an engine heartbeat fails to tick, KMD
initiates a gt/chip reset. All active contexts are marked as guilty and
discarded.
In order to correctly mark the contexts as guilty/innocent, pass a mask
of engines that were reset to __guc_reset_context.
Fixes: eb5e7da736f3 ("drm/i915/guc: Reset implementation for new GuC interface")
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Reviewed-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220426003045.3929439-1-umesh.nerlige.ramappa@intel.com
(cherry picked from commit 303760aa914b7f5ac9602dbb4b471a2ad52eeb3e)
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Acked-by: Patrik Jakobsson <pjakobsson@suse.de>
---
drivers/gpu/drm/i915/gt/intel_reset.c | 2 +-
drivers/gpu/drm/i915/gt/uc/intel_guc.h | 2 +-
.../gpu/drm/i915/gt/uc/intel_guc_submission.c | 16 ++++++++--------
drivers/gpu/drm/i915/gt/uc/intel_uc.c | 2 +-
drivers/gpu/drm/i915/gt/uc/intel_uc.h | 2 +-
5 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index 82713264b96c..b7c6d4462ec5 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -806,7 +806,7 @@ static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
__intel_engine_reset(engine, stalled_mask & engine->mask);
local_bh_enable();
- intel_uc_reset(>->uc, true);
+ intel_uc_reset(>->uc, ALL_ENGINES);
intel_ggtt_restore_fences(gt->ggtt);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index bf7079480d47..2488d1197f3e 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -438,7 +438,7 @@ int intel_guc_global_policies_update(struct intel_guc *guc);
void intel_guc_context_ban(struct intel_context *ce, struct i915_request *rq);
void intel_guc_submission_reset_prepare(struct intel_guc *guc);
-void intel_guc_submission_reset(struct intel_guc *guc, bool stalled);
+void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stalled);
void intel_guc_submission_reset_finish(struct intel_guc *guc);
void intel_guc_submission_cancel_requests(struct intel_guc *guc);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 1ce7e04aa837..28f9aac0201d 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1590,9 +1590,9 @@ __unwind_incomplete_requests(struct intel_context *ce)
spin_unlock_irqrestore(&sched_engine->lock, flags);
}
-static void __guc_reset_context(struct intel_context *ce, bool stalled)
+static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t stalled)
{
- bool local_stalled;
+ bool guilty;
struct i915_request *rq;
unsigned long flags;
u32 head;
@@ -1620,7 +1620,7 @@ static void __guc_reset_context(struct intel_context *ce, bool stalled)
if (!intel_context_is_pinned(ce))
goto next_context;
- local_stalled = false;
+ guilty = false;
rq = intel_context_find_active_request(ce);
if (!rq) {
head = ce->ring->tail;
@@ -1628,14 +1628,14 @@ static void __guc_reset_context(struct intel_context *ce, bool stalled)
}
if (i915_request_started(rq))
- local_stalled = true;
+ guilty = stalled & ce->engine->mask;
GEM_BUG_ON(i915_active_is_idle(&ce->active));
head = intel_ring_wrap(ce->ring, rq->head);
- __i915_request_reset(rq, local_stalled && stalled);
+ __i915_request_reset(rq, guilty);
out_replay:
- guc_reset_state(ce, head, local_stalled && stalled);
+ guc_reset_state(ce, head, guilty);
next_context:
if (i != number_children)
ce = list_next_entry(ce, parallel.child_link);
@@ -1645,7 +1645,7 @@ static void __guc_reset_context(struct intel_context *ce, bool stalled)
intel_context_put(parent);
}
-void intel_guc_submission_reset(struct intel_guc *guc, bool stalled)
+void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stalled)
{
struct intel_context *ce;
unsigned long index;
@@ -4013,7 +4013,7 @@ static void guc_context_replay(struct intel_context *ce)
{
struct i915_sched_engine *sched_engine = ce->engine->sched_engine;
- __guc_reset_context(ce, true);
+ __guc_reset_context(ce, ce->engine->mask);
tasklet_hi_schedule(&sched_engine->tasklet);
}
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
index da199aa6989f..8eb34de2f20c 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
@@ -593,7 +593,7 @@ void intel_uc_reset_prepare(struct intel_uc *uc)
__uc_sanitize(uc);
}
-void intel_uc_reset(struct intel_uc *uc, bool stalled)
+void intel_uc_reset(struct intel_uc *uc, intel_engine_mask_t stalled)
{
struct intel_guc *guc = &uc->guc;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.h b/drivers/gpu/drm/i915/gt/uc/intel_uc.h
index 866b462821c0..a8f38c2c60e2 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.h
@@ -42,7 +42,7 @@ void intel_uc_driver_late_release(struct intel_uc *uc);
void intel_uc_driver_remove(struct intel_uc *uc);
void intel_uc_init_mmio(struct intel_uc *uc);
void intel_uc_reset_prepare(struct intel_uc *uc);
-void intel_uc_reset(struct intel_uc *uc, bool stalled);
+void intel_uc_reset(struct intel_uc *uc, intel_engine_mask_t stalled);
void intel_uc_reset_finish(struct intel_uc *uc);
void intel_uc_cancel_requests(struct intel_uc *uc);
void intel_uc_suspend(struct intel_uc *uc);
--
2.38.1