Skip to content

Commit c9474b7

Browse files
mbrost05 authored and rodrigovivi committed
drm/xe: Wedge the entire device
Wedge the entire device, not just GT which may have triggered the wedge. To implement this, cleanup the layering so xe_device_declare_wedged() calls into the lower layers (GT) to ensure entire device is wedged. While we are here, also signal any pending GT TLB invalidations upon wedging device. Lastly, short circuit reset wait if device is wedged. v2: - Short circuit reset wait if device is wedged (Local testing) Fixes: 8ed9aaa ("drm/xe: Force wedged state and block GT reset upon any GPU hang") Cc: Rodrigo Vivi <[email protected]> Signed-off-by: Matthew Brost <[email protected]> Reviewed-by: Jonathan Cavitt <[email protected]> Link: https://patchwork.freedesktop.org/patch/msgid/[email protected] (cherry picked from commit 7dbe8af) Signed-off-by: Rodrigo Vivi <[email protected]>
1 parent bf07ca9 commit c9474b7

File tree

9 files changed

+80
-13
lines changed

9 files changed

+80
-13
lines changed

drivers/gpu/drm/xe/xe_device.c

+6
Original file line numberDiff line numberDiff line change
@@ -870,6 +870,9 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
870870
*/
871871
void xe_device_declare_wedged(struct xe_device *xe)
872872
{
873+
struct xe_gt *gt;
874+
u8 id;
875+
873876
if (xe->wedged.mode == 0) {
874877
drm_dbg(&xe->drm, "Wedged mode is forcibly disabled\n");
875878
return;
@@ -883,4 +886,7 @@ void xe_device_declare_wedged(struct xe_device *xe)
883886
"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
884887
dev_name(xe->drm.dev));
885888
}
889+
890+
for_each_gt(gt, xe, id)
891+
xe_gt_declare_wedged(gt);
886892
}

drivers/gpu/drm/xe/xe_gt.c

+15
Original file line numberDiff line numberDiff line change
@@ -904,3 +904,18 @@ struct xe_hw_engine *xe_gt_any_hw_engine(struct xe_gt *gt)
904904

905905
return NULL;
906906
}
907+
908+
/**
 * xe_gt_declare_wedged() - Declare GT wedged
 * @gt: the GT object
 *
 * Wedge the GT which stops all submission, saves desired debug state, and
 * cleans up anything which could timeout.
 */
void xe_gt_declare_wedged(struct xe_gt *gt)
{
	/* Only valid once the device-level wedged mode has been set */
	xe_gt_assert(gt, gt_to_xe(gt)->wedged.mode);

	/*
	 * Wedge the uC (stops GuC submission) first, then signal any pending
	 * GT TLB invalidation fences so waiters do not time out (per commit
	 * intent: "signal any pending GT TLB invalidations upon wedging").
	 */
	xe_uc_declare_wedged(&gt->uc);
	xe_gt_tlb_invalidation_reset(gt);
}

drivers/gpu/drm/xe/xe_gt.h

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ struct xe_gt *xe_gt_alloc(struct xe_tile *tile);
3737
int xe_gt_init_hwconfig(struct xe_gt *gt);
3838
int xe_gt_init_early(struct xe_gt *gt);
3939
int xe_gt_init(struct xe_gt *gt);
40+
void xe_gt_declare_wedged(struct xe_gt *gt);
4041
int xe_gt_record_default_lrcs(struct xe_gt *gt);
4142

4243
/**

drivers/gpu/drm/xe/xe_guc.c

+16
Original file line numberDiff line numberDiff line change
@@ -1178,3 +1178,19 @@ void xe_guc_print_info(struct xe_guc *guc, struct drm_printer *p)
11781178
xe_guc_ct_print(&guc->ct, p, false);
11791179
xe_guc_submit_print(guc, p);
11801180
}
1181+
1182+
/**
 * xe_guc_declare_wedged() - Declare GuC wedged
 * @guc: the GuC object
 *
 * Wedge the GuC which stops all submission, saves desired debug state, and
 * cleans up anything which could timeout.
 */
void xe_guc_declare_wedged(struct xe_guc *guc)
{
	/* Caller (device/GT wedge path) must have set a wedged mode */
	xe_gt_assert(guc_to_gt(guc), guc_to_xe(guc)->wedged.mode);

	/*
	 * Order matters: quiesce resets, stop CT communication with the GuC,
	 * then wedge the submission backend (takes refs on exec queues).
	 */
	xe_guc_reset_prepare(guc);
	xe_guc_ct_stop(&guc->ct);
	xe_guc_submit_wedge(guc);
}

drivers/gpu/drm/xe/xe_guc.h

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ void xe_guc_reset_wait(struct xe_guc *guc);
3737
void xe_guc_stop_prepare(struct xe_guc *guc);
3838
void xe_guc_stop(struct xe_guc *guc);
3939
int xe_guc_start(struct xe_guc *guc);
40+
void xe_guc_declare_wedged(struct xe_guc *guc);
4041

4142
static inline u16 xe_engine_class_to_guc_class(enum xe_engine_class class)
4243
{

drivers/gpu/drm/xe/xe_guc_submit.c

+25-13
Original file line numberDiff line numberDiff line change
@@ -861,36 +861,47 @@ static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q)
861861
xe_sched_tdr_queue_imm(&q->guc->sched);
862862
}
863863

/**
 * xe_guc_submit_wedge() - Wedge GuC submission
 * @guc: the GuC object
 *
 * Save exec queue's registered with GuC state by taking a ref to each queue.
 * Register a DRMM handler to drop refs upon driver unload.
 */
void xe_guc_submit_wedge(struct xe_guc *guc)
{
	struct xe_device *xe = guc_to_xe(guc);
	struct xe_exec_queue *q;
	unsigned long index;
	int err;

	/* Only legal to call once a wedged mode has been declared */
	xe_gt_assert(guc_to_gt(guc), guc_to_xe(guc)->wedged.mode);

	/* Refs taken below are dropped by guc_submit_wedged_fini on unload */
	err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm,
				       guc_submit_wedged_fini, guc);
	if (err) {
		/* Device is wedged regardless; continue without the cleanup hook */
		drm_err(&xe->drm, "Failed to register xe_guc_submit clean-up on wedged.mode=2. Although device is wedged.\n");
		return;
	}

	/* Pin every registered exec queue and mark it wedged */
	mutex_lock(&guc->submission_state.lock);
	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
		if (xe_exec_queue_get_unless_zero(q))
			set_exec_queue_wedged(q);
	mutex_unlock(&guc->submission_state.lock);
}

/*
 * Hint path: in wedged.mode == 2, any hang wedges the whole device.
 * Layering after this commit: this only *declares* the device wedged;
 * the device layer then calls back down into GT/GuC to do the teardown.
 */
static bool guc_submit_hint_wedged(struct xe_guc *guc)
{
	struct xe_device *xe = guc_to_xe(guc);

	if (xe->wedged.mode != 2)
		return false;

	if (xe_device_wedged(xe))
		return true;

	xe_device_declare_wedged(xe);

	return true;
}
@@ -1677,7 +1688,8 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc)
16771688

16781689
void xe_guc_submit_reset_wait(struct xe_guc *guc)
{
	/*
	 * Short circuit the wait if the device is wedged: submission never
	 * restarts on a wedged device, so waiting for !stopped would hang.
	 */
	wait_event(guc->ct.wq, xe_device_wedged(guc_to_xe(guc)) ||
		   !guc_read_stopped(guc));
}
16821694

16831695
void xe_guc_submit_stop(struct xe_guc *guc)

drivers/gpu/drm/xe/xe_guc_submit.h

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc);
1818
void xe_guc_submit_reset_wait(struct xe_guc *guc);
1919
void xe_guc_submit_stop(struct xe_guc *guc);
2020
int xe_guc_submit_start(struct xe_guc *guc);
21+
void xe_guc_submit_wedge(struct xe_guc *guc);
2122

2223
int xe_guc_sched_done_handler(struct xe_guc *guc, u32 *msg, u32 len);
2324
int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len);

drivers/gpu/drm/xe/xe_uc.c

+14
Original file line numberDiff line numberDiff line change
@@ -300,3 +300,17 @@ void xe_uc_remove(struct xe_uc *uc)
300300
{
301301
xe_gsc_remove(&uc->gsc);
302302
}
303+
304+
/**
 * xe_uc_declare_wedged() - Declare UC wedged
 * @uc: the UC object
 *
 * Wedge the UC which stops all submission, saves desired debug state, and
 * cleans up anything which could timeout.
 */
void xe_uc_declare_wedged(struct xe_uc *uc)
{
	/* Device-level wedged mode must already be set */
	xe_gt_assert(uc_to_gt(uc), uc_to_xe(uc)->wedged.mode);

	/* Currently only the GuC needs wedging within the uC layer */
	xe_guc_declare_wedged(&uc->guc);
}

drivers/gpu/drm/xe/xe_uc.h

+1
Original file line numberDiff line numberDiff line change
@@ -21,5 +21,6 @@ int xe_uc_start(struct xe_uc *uc);
2121
int xe_uc_suspend(struct xe_uc *uc);
2222
int xe_uc_sanitize_reset(struct xe_uc *uc);
2323
void xe_uc_remove(struct xe_uc *uc);
24+
void xe_uc_declare_wedged(struct xe_uc *uc);
2425

2526
#endif

0 commit comments

Comments
 (0)