Skip to content

Commit

Permalink
drm/xe: Introduce a simple wedged state
Browse files Browse the repository at this point in the history
Introduce a very simple 'wedged' state where any attempt
to access the GPU is entirely blocked.

In some critical cases, like on gt_reset failure, we need to
block any other attempt to use the GPU. Otherwise we are at
risk of reaching cases that would force us to reboot the machine.

So, when these cases are identified we corner and block any GPU
access. No IOCTL and not even another GT reset should be attempted.

The 'wedged' state in Xe is an end state with no way back.
Only a device "re-probe" (unbind + bind) can restore the GPU access.

v2: - s/wedged/busted (Lucas)
    - use unbind+bind instead of module reload (Lucas)
    - added more info on unbind operations and instruction on bug report
    - only print the message once.

v3: - s/busted/wedged (Ashutosh, Tvrtko, Thomas)
    - don't assume user has sudo and tee available (Lucas)

v4: - remove unnecessary cases around ct communication or migration.

Cc: Ashutosh Dixit <[email protected]>
Cc: Tvrtko Ursulin <[email protected]>
Cc: Thomas Hellström <[email protected]>
Cc: Lucas De Marchi <[email protected]>
Cc: Anshuman Gupta <[email protected]>
Reviewed-by: Himal Prasad Ghimiray <[email protected]>
Reviewed-by: Lucas De Marchi <[email protected]> #v2
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
Signed-off-by: Rodrigo Vivi <[email protected]>
  • Loading branch information
rodrigovivi committed Apr 24, 2024
1 parent c8d4524 commit fb74b20
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 1 deletion.
6 changes: 6 additions & 0 deletions drivers/gpu/drm/xe/xe_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,9 @@ static long xe_drm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct xe_device *xe = to_xe_device(file_priv->minor->dev);
long ret;

if (xe_device_wedged(xe))
return -ECANCELED;

ret = xe_pm_runtime_get_ioctl(xe);
if (ret >= 0)
ret = drm_ioctl(file, cmd, arg);
Expand All @@ -152,6 +155,9 @@ static long xe_drm_compat_ioctl(struct file *file, unsigned int cmd, unsigned lo
struct xe_device *xe = to_xe_device(file_priv->minor->dev);
long ret;

if (xe_device_wedged(xe))
return -ECANCELED;

ret = xe_pm_runtime_get_ioctl(xe);
if (ret >= 0)
ret = drm_compat_ioctl(file, cmd, arg);
Expand Down
20 changes: 20 additions & 0 deletions drivers/gpu/drm/xe/xe_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,4 +167,24 @@ void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p);
u64 xe_device_canonicalize_addr(struct xe_device *xe, u64 address);
u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address);

/**
 * xe_device_wedged - Query whether the device has been declared wedged
 * @xe: xe device instance
 *
 * Atomically reads the @xe->wedged flag.
 *
 * Return: true if the flag is non-zero, false otherwise.
 */
static inline bool xe_device_wedged(struct xe_device *xe)
{
	return atomic_read(&xe->wedged) != 0;
}

/**
 * xe_device_declare_wedged - Mark the device as wedged
 * @xe: xe device instance
 *
 * Atomically sets @xe->wedged to 1. Only the caller that observes the
 * previous value as zero goes on to set @xe->needs_flr_on_fini and to emit
 * the error message, so the message is printed at most once per transition.
 * Nothing in this helper ever clears the flag again.
 */
static inline void xe_device_declare_wedged(struct xe_device *xe)
{
	const char *name;

	/* Flag was already set: another caller has done the work below. */
	if (atomic_xchg(&xe->wedged, 1))
		return;

	xe->needs_flr_on_fini = true;

	/* The same device name fills all three %s slots of the message. */
	name = dev_name(xe->drm.dev);
	drm_err(&xe->drm,
		"CRITICAL: Xe has declared device %s as wedged.\n"
		"IOCTLs and executions are blocked until device is probed again with unbind and bind operations:\n"
		"echo '%s' > /sys/bus/pci/drivers/xe/unbind\n"
		"echo '%s' > /sys/bus/pci/drivers/xe/bind\n"
		"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
		name, name, name);
}

#endif
3 changes: 3 additions & 0 deletions drivers/gpu/drm/xe/xe_device_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,9 @@ struct xe_device {
/** @needs_flr_on_fini: requests function-reset on fini */
bool needs_flr_on_fini;

/** @wedged: Xe device faced a critical error and is now blocked. */
atomic_t wedged;

/* private: */

#if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
Expand Down
5 changes: 4 additions & 1 deletion drivers/gpu/drm/xe/xe_gt.c
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,9 @@ static int gt_reset(struct xe_gt *gt)
{
int err;

if (xe_device_wedged(gt_to_xe(gt)))
return -ECANCELED;

/* We only support GT resets with GuC submission */
if (!xe_device_uc_enabled(gt_to_xe(gt)))
return -ENODEV;
Expand Down Expand Up @@ -685,7 +688,7 @@ static int gt_reset(struct xe_gt *gt)
err_fail:
xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));

gt_to_xe(gt)->needs_flr_on_fini = true;
xe_device_declare_wedged(gt_to_xe(gt));

return err;
}
Expand Down
3 changes: 3 additions & 0 deletions drivers/gpu/drm/xe/xe_guc_pc.c
Original file line number Diff line number Diff line change
Expand Up @@ -902,6 +902,9 @@ static void xe_guc_pc_fini(struct drm_device *drm, void *arg)
return;
}

if (xe_device_wedged(xe))
return;

XE_WARN_ON(xe_force_wake_get(gt_to_fw(pc_to_gt(pc)), XE_FORCEWAKE_ALL));
XE_WARN_ON(xe_guc_pc_gucrc_disable(pc));
XE_WARN_ON(xe_guc_pc_stop(pc));
Expand Down

0 comments on commit fb74b20

Please sign in to comment.