From dc5ac599ebd166f7541f8cfaad2b664e1c81b5e7 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 16 Dec 2019 14:24:09 +0000
Subject: drm/i915/gt: Tidy up full-ppgtt on Ivybridge
Git-commit: 902eb748e5c3d32ad955eef439e0911905758166
Patch-mainline: v5.6-rc1
References: jsc#SLE-12680, jsc#SLE-12880, jsc#SLE-12882, jsc#SLE-12883, jsc#SLE-13496, jsc#SLE-15322
With a couple more memory barriers dotted around the place we can
significantly reduce the MTBF on Ivybridge. Still doesn't really help
Haswell though.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Acked-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20191216142409.2605211-1-chris@chris-wilson.co.uk
Signed-off-by: Patrik Jakobsson <pjakobsson@suse.de>
---
.../gpu/drm/i915/gt/intel_ring_submission.c | 138 ++++++------------
drivers/gpu/drm/i915/i915_gem_gtt.c | 2 +
2 files changed, 43 insertions(+), 97 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
index 77f78906f612..1b21a513521b 100644
--- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
@@ -362,6 +362,12 @@ gen7_render_ring_flush(struct i915_request *rq, u32 mode)
*/
flags |= PIPE_CONTROL_CS_STALL;
+ /*
+ * CS_STALL suggests at least a post-sync write.
+ */
+ flags |= PIPE_CONTROL_QW_WRITE;
+ flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
+
/* Just flush everything. Experiments have shown that reducing the
* number of bits based on the write domains has little performance
* impact.
@@ -380,13 +386,6 @@ gen7_render_ring_flush(struct i915_request *rq, u32 mode)
flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;
- /*
- * TLB invalidate requires a post-sync write.
- */
- flags |= PIPE_CONTROL_QW_WRITE;
- flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
-
- flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
/* Workaround: we must issue a pipe_control with CS-stall bit
* set before a pipe_control command that has the state cache
@@ -1371,50 +1370,26 @@ static int load_pd_dir(struct i915_request *rq,
const struct intel_engine_cs * const engine = rq->engine;
u32 *cs;
- cs = intel_ring_begin(rq, 12);
+ cs = intel_ring_begin(rq, 10);
if (IS_ERR(cs))
return PTR_ERR(cs);
- *cs++ = MI_LOAD_REGISTER_IMM(1);
+ *cs++ = MI_LOAD_REGISTER_IMM(3);
*cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->mmio_base));
*cs++ = valid;
-
- *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
- *cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->mmio_base));
- *cs++ = intel_gt_scratch_offset(rq->engine->gt,
- INTEL_GT_SCRATCH_FIELD_DEFAULT);
-
- *cs++ = MI_LOAD_REGISTER_IMM(1);
*cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
*cs++ = px_base(ppgtt->pd)->ggtt_offset << 10;
+ *cs++ = i915_mmio_reg_offset(RING_INSTPM(engine->mmio_base));
+ *cs++ = _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE);
/* Stall until the page table load is complete? */
*cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
*cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
- *cs++ = intel_gt_scratch_offset(rq->engine->gt,
+ *cs++ = intel_gt_scratch_offset(engine->gt,
INTEL_GT_SCRATCH_FIELD_DEFAULT);
intel_ring_advance(rq, cs);
- return rq->engine->emit_flush(rq, EMIT_FLUSH);
-}
-
-static int flush_tlb(struct i915_request *rq)
-{
- const struct intel_engine_cs * const engine = rq->engine;
- u32 *cs;
-
- cs = intel_ring_begin(rq, 4);
- if (IS_ERR(cs))
- return PTR_ERR(cs);
-
- *cs++ = MI_LOAD_REGISTER_IMM(1);
- *cs++ = i915_mmio_reg_offset(RING_INSTPM(engine->mmio_base));
- *cs++ = _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE);
-
- *cs++ = MI_NOOP;
- intel_ring_advance(rq, cs);
-
return 0;
}
@@ -1582,20 +1557,42 @@ static int remap_l3(struct i915_request *rq)
return 0;
}
+static int switch_mm(struct i915_request *rq, struct i915_address_space *vm)
+{
+ int ret;
+
+ if (!vm)
+ return 0;
+
+ ret = rq->engine->emit_flush(rq, EMIT_FLUSH);
+ if (ret)
+ return ret;
+
+ /*
+ * Not only do we need a full barrier (post-sync write) after
+ * invalidating the TLBs, but we need to wait a little bit
+ * longer. Whether this is merely delaying us, or the
+ * subsequent flush is a key part of serialising with the
+ * post-sync op, this extra pass appears vital before a
+ * mm switch!
+ */
+ ret = load_pd_dir(rq, i915_vm_to_ppgtt(vm), PP_DIR_DCLV_2G);
+ if (ret)
+ return ret;
+
+ return rq->engine->emit_flush(rq, EMIT_FLUSH);
+}
+
static int switch_context(struct i915_request *rq)
{
struct intel_context *ce = rq->hw_context;
- struct i915_address_space *vm = vm_alias(ce);
- u32 hw_flags = 0;
int ret;
GEM_BUG_ON(HAS_EXECLISTS(rq->i915));
- if (vm) {
- ret = load_pd_dir(rq, i915_vm_to_ppgtt(vm));
- if (ret)
- return ret;
- }
+ ret = switch_mm(rq, vm_alias(ce));
+ if (ret)
+ return ret;
if (ce->state) {
u32 flags;
@@ -1607,65 +1604,12 @@ static int switch_context(struct i915_request *rq)
BUILD_BUG_ON(HSW_MI_RS_RESTORE_STATE_EN != MI_RESTORE_EXT_STATE_EN);
flags = MI_SAVE_EXT_STATE_EN | MI_MM_SPACE_GTT;
- if (!rq->engine->default_state)
+ if (test_bit(CONTEXT_VALID_BIT, &ce->flags))
flags |= MI_RESTORE_EXT_STATE_EN;
else
flags |= MI_RESTORE_INHIBIT;
ret = mi_set_context(rq, flags);
-
- ret = load_pd_dir(rq, i915_vm_to_ppgtt(vm));
- if (ret)
- return ret;
-
- ret = engine->emit_flush(rq, EMIT_INVALIDATE);
- if (ret)
- return ret;
-
- ret = flush_pd_dir(rq);
- if (ret)
- return ret;
-
- /*
- * Not only do we need a full barrier (post-sync write) after
- * invalidating the TLBs, but we need to wait a little bit
- * longer. Whether this is merely delaying us, or the
- * subsequent flush is a key part of serialising with the
- * post-sync op, this extra pass appears vital before a
- * mm switch!
- */
- ret = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
- if (ret)
- return ret;
-
- ret = flush_tlb(rq);
- if (ret)
- return ret;
-
- ret = load_pd_dir(rq, i915_vm_to_ppgtt(vm), 0);
- if (ret)
- return ret;
-
- ret = load_pd_dir(rq, i915_vm_to_ppgtt(vm), PP_DIR_DCLV_2G);
- if (ret)
- return ret;
-
- ret = flush_tlb(rq);
- if (ret)
- return ret;
-
- ret = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
- if (ret)
- return ret;
- }
-
- if (ce->state) {
- GEM_BUG_ON(rq->engine->id != RCS0);
-
- if (!test_bit(CONTEXT_VALID_BIT, &ce->flags))
- hw_flags = MI_RESTORE_INHIBIT;
-
- ret = mi_set_context(rq, hw_flags);
if (ret)
return ret;
}
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 24de86cc29ca..2c706cf7bdc7 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1711,8 +1711,10 @@ static void gen6_flush_pd(struct gen6_ppgtt *ppgtt, u64 start, u64 end)
gen6_for_each_pde(pt, pd, start, end, pde)
gen6_write_pde(ppgtt, pde, pt);
+ mb();
ioread32(ppgtt->pd_addr + pde - 1);
gen6_ggtt_invalidate(ppgtt->base.vm.gt->ggtt);
+ mb();
mutex_unlock(&ppgtt->flush);
}
--
2.28.0