From d1d1ebf4120db07a6706faed970be7460303834e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 21 Jul 2017 13:32:33 +0100 Subject: [PATCH] drm/i915: Don't touch fence->error when resetting an innocent request Git-commit: d1d1ebf4120db07a6706faed970be7460303834e Patch-mainline: v4.14-rc1 References: FATE#322643 bsc#1055900 If the request has been completed before the reset took effect, we don't need to mark it up as being a victim. Touching fence->error after the fence has been signaled is detected by dma_fence_set_error() and triggers a BUG: [ 231.743133] kernel BUG at ./include/linux/dma-fence.h:434! [ 231.743156] invalid opcode: 0000 [#1] SMP KASAN [ 231.743172] Modules linked in: i915 drm_kms_helper drm iptable_nat nf_nat_ipv4 nf_nat x86_pkg_temp_thermal iosf_mbi i2c_algo_bit cfbfillrect syscopyarea cfbimgblt sysfillrect sysimgblt fb_sys_fops cfbcopyarea fb font fbdev [last unloaded: drm] [ 231.743221] CPU: 2 PID: 20 Comm: kworker/2:0 Tainted: G U 4.13.0-rc1+ #52 [ 231.743236] Hardware name: Hewlett-Packard HP EliteBook 8460p/161C, BIOS 68SCF Ver. F.01 03/11/2011 [ 231.743363] Workqueue: events_long i915_hangcheck_elapsed [i915] [ 231.743382] task: ffff8801f42e9780 task.stack: ffff8801f42f8000 [ 231.743489] RIP: 0010:i915_gem_reset_engine+0x45a/0x460 [i915] [ 231.743505] RSP: 0018:ffff8801f42ff770 EFLAGS: 00010202 [ 231.743521] RAX: 0000000000000007 RBX: ffff8801bf6b1880 RCX: ffffffffa02881a6 [ 231.743537] RDX: dffffc0000000000 RSI: dffffc0000000000 RDI: ffff8801bf6b18c8 [ 231.743551] RBP: ffff8801f42ff7c8 R08: 0000000000000001 R09: 0000000000000000 [ 231.743566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801edb02d00 [ 231.743581] R13: ffff8801e19d4200 R14: 000000000000001d R15: ffff8801ce2a4000 [ 231.743599] FS: 0000000000000000(0000) GS:ffff8801f5a80000(0000) knlGS:0000000000000000 [ 231.743614] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 231.743629] CR2: 00007f0ebd1add10 CR3: 0000000002621000 CR4: 00000000000406e0 [ 231.743643] Call Trace: [ 231.743752] i915_gem_reset+0x6c/0x150 [i915] [ 231.743853] i915_reset+0x175/0x210 [i915] [ 231.743958] i915_reset_device+0x33b/0x350 [i915] [ 231.744061] ? valleyview_pipestat_irq_handler+0xe0/0xe0 [i915] [ 231.744081] ? trace_hardirqs_off_caller+0x70/0x110 [ 231.744102] ? _raw_spin_unlock_irqrestore+0x46/0x50 [ 231.744120] ? find_held_lock+0x119/0x150 [ 231.744138] ? mark_lock+0x6d/0x850 [ 231.744241] ? gen8_gt_irq_ack+0x1f0/0x1f0 [i915] [ 231.744262] ? work_on_cpu_safe+0x60/0x60 [ 231.744284] ? rcu_read_lock_sched_held+0x57/0xa0 [ 231.744400] ? gen6_read32+0x2ba/0x320 [i915] [ 231.744506] i915_handle_error+0x382/0x5f0 [i915] [ 231.744611] ? gen6_rps_reset_ei+0x20/0x20 [i915] [ 231.744630] ? vsnprintf+0x128/0x8e0 [ 231.744649] ? pointer+0x6b0/0x6b0 [ 231.744667] ? debug_check_no_locks_freed+0x1a0/0x1a0 [ 231.744688] ? scnprintf+0x92/0xe0 [ 231.744706] ? snprintf+0xb0/0xb0 [ 231.744820] hangcheck_declare_hang+0x15a/0x1a0 [i915] [ 231.744932] ? engine_stuck+0x440/0x440 [i915] [ 231.744951] ? rcu_read_lock_sched_held+0x57/0xa0 [ 231.745062] ? gen6_read32+0x2ba/0x320 [i915] [ 231.745173] ? gen6_read16+0x320/0x320 [i915] [ 231.745284] ? intel_engine_get_active_head+0x91/0x170 [i915] [ 231.745401] i915_hangcheck_elapsed+0x3d8/0x400 [i915] [ 231.745424] process_one_work+0x3e8/0xac0 [ 231.745444] ? pwq_dec_nr_in_flight+0x110/0x110 [ 231.745464] ? do_raw_spin_lock+0x8e/0x120 [ 231.745484] worker_thread+0x8d/0x720 [ 231.745506] kthread+0x19e/0x1f0 [ 231.745524] ? process_one_work+0xac0/0xac0 [ 231.745541] ? kthread_create_on_node+0xa0/0xa0 [ 231.745560] ret_from_fork+0x27/0x40 [ 231.745581] Code: 8b 7d c8 e8 49 0d 02 e1 49 8b 7f 38 48 8b 75 b8 48 83 c7 10 e8 b8 89 be e1 e9 95 fc ff ff 4c 89 e7 e8 4b b9 ff ff e9 30 ff ff ff <0f> 0b 0f 1f 40 00 55 48 89 e5 41 57 41 56 41 55 41 54 49 89 fe [ 231.745767] RIP: i915_gem_reset_engine+0x45a/0x460 [i915] RSP: ffff8801f42ff770 At first glance this looks to be related to commit c64992e035d7 ("drm/i915: Look for active requests earlier in the reset path"), but it could easily happen before as well. On the other hand, we no longer logged victims due to the active_request being dropped earlier. V2: Be trickier to unwind the incomplete request as we cannot rely on request retirement for the lockless per-engine reset. V3: Reprobe the active request at the time of the reset. Reported-by: Daniel Vetter Fixes: c64992e035d7 ("drm/i915: Look for active requests earlier in the reset path") Signed-off-by: Chris Wilson Cc: Michel Thierry Cc: Mika Kuoppala Cc: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20170721123238.16428-15-chris@chris-wilson.co.uk Reviewed-by: Joonas Lahtinen #v1 Reviewed-by: Michel Thierry Signed-off-by: Daniel Vetter Acked-by: Takashi Iwai --- drivers/gpu/drm/i915/i915_gem.c | 53 +++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 19 deletions(-) --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2854,11 +2854,9 @@ i915_gem_reset_prepare_engine(struct int if (engine->irq_seqno_barrier) engine->irq_seqno_barrier(engine); - if (engine_stalled(engine)) { - request = i915_gem_find_active_request(engine); - if (request && request->fence.error == -EIO) - request = ERR_PTR(-EIO); /* Previous reset failed! */ - } + request = i915_gem_find_active_request(engine); + if (request && request->fence.error == -EIO) + request = ERR_PTR(-EIO); /* Previous reset failed! */ return request; } @@ -2927,12 +2925,11 @@ static void engine_skip_context(struct d spin_unlock_irqrestore(&engine->timeline->lock, flags); } -/* Returns true if the request was guilty of hang */ -static bool i915_gem_reset_request(struct drm_i915_gem_request *request) +/* Returns the request if it was guilty of the hang */ +static struct drm_i915_gem_request * +i915_gem_reset_request(struct intel_engine_cs *engine, + struct drm_i915_gem_request *request) { - /* Read once and return the resolution */ - const bool guilty = !i915_gem_request_completed(request); - /* The guilty request will get skipped on a hung engine. * * Users of client default contexts do not rely on logical @@ -2954,15 +2951,34 @@ static bool i915_gem_reset_request(struc * subsequent hangs. */ - if (guilty) { + if (engine_stalled(engine)) { i915_gem_context_mark_guilty(request->ctx); skip_request(request); + + /* If this context is now banned, skip all pending requests. */ + if (i915_gem_context_is_banned(request->ctx)) + engine_skip_context(request); } else { - i915_gem_context_mark_innocent(request->ctx); - dma_fence_set_error(&request->fence, -EAGAIN); + /* + * Since this is not the hung engine, it may have advanced + * since the hang declaration. Double check by refinding + * the active request at the time of the reset. + */ + request = i915_gem_find_active_request(engine); + if (request) { + i915_gem_context_mark_innocent(request->ctx); + dma_fence_set_error(&request->fence, -EAGAIN); + + /* Rewind the engine to replay the incomplete rq */ + spin_lock_irq(&engine->timeline->lock); + request = list_prev_entry(request, link); + if (&request->link == &engine->timeline->requests) + request = NULL; + spin_unlock_irq(&engine->timeline->lock); + } } - return guilty; + return request; } void i915_gem_reset_engine(struct intel_engine_cs *engine, @@ -2970,13 +2986,12 @@ void i915_gem_reset_engine(struct intel_ { engine->irq_posted = 0; - if (request && i915_gem_reset_request(request)) { + if (request) + request = i915_gem_reset_request(engine, request); + + if (request) { DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n", engine->name, request->global_seqno); - - /* If this context is now banned, skip all pending requests. */ - if (i915_gem_context_is_banned(request->ctx)) - engine_skip_context(request); } /* Setup the CS to resume from the breadcrumb of the hung request */