Skip to content

Commit afefd6f

Browse files
srishanmalexdeucher
authored and committed
drm/amdgpu: Implement Enforce Isolation Handler for KGD/KFD serialization
This commit introduces the Enforce Isolation Handler designed to enforce shader isolation on AMD GPUs, which helps to prevent data leakage between different processes. The handler counts the number of emitted fences for each GFX and compute ring. If there are any fences, it schedules the `enforce_isolation_work` to be run after a delay of `GFX_SLICE_PERIOD`. If there are no fences, it signals the Kernel Fusion Driver (KFD) to resume the runqueue. The function is synchronized using the `enforce_isolation_mutex`. This commit also introduces a reference count mechanism (kfd_sch_req_count) to keep track of the number of requests to enable the KFD scheduler. When a request to enable the KFD scheduler is made, the reference count is decremented. When the reference count reaches zero, a delayed work is scheduled to enforce isolation after a delay of GFX_SLICE_PERIOD. When a request to disable the KFD scheduler is made, the function first checks if the reference count is zero. If it is, it cancels the delayed work for enforcing isolation and checks if the KFD scheduler is active. If the KFD scheduler is active, it sends a request to stop the KFD scheduler and sets the KFD scheduler state to inactive. Then, it increments the reference count. The function is synchronized using the kfd_sch_mutex to ensure that the KFD scheduler state and reference count are updated atomically. Cc: Christian König <[email protected]> Cc: Alex Deucher <[email protected]> Signed-off-by: Alex Deucher <[email protected]> Signed-off-by: Srinivasan Shanmugam <[email protected]> Suggested-by: Christian König <[email protected]> Suggested-by: Alex Deucher <[email protected]>
1 parent 234eebe commit afefd6f

File tree

4 files changed

+200
-0
lines changed

4 files changed

+200
-0
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu.h

+2
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@
118118

119119
#define MAX_GPU_INSTANCE 64
120120

121+
#define GFX_SLICE_PERIOD msecs_to_jiffies(250)
122+
121123
struct amdgpu_gpu_instance {
122124
struct amdgpu_device *adev;
123125
int mgpu_fan_enabled;

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

+16
Original file line numberDiff line numberDiff line change
@@ -4067,6 +4067,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
40674067
mutex_init(&adev->gfx.reset_sem_mutex);
40684068
/* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
40694069
mutex_init(&adev->enforce_isolation_mutex);
4070+
mutex_init(&adev->gfx.kfd_sch_mutex);
40704071

40714072
amdgpu_device_init_apu_flags(adev);
40724073

@@ -4098,6 +4099,21 @@ int amdgpu_device_init(struct amdgpu_device *adev,
40984099
amdgpu_device_delayed_init_work_handler);
40994100
INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
41004101
amdgpu_device_delay_enable_gfx_off);
4102+
/*
4103+
* Initialize the enforce_isolation work structures for each XCP
4104+
* partition. This work handler is responsible for enforcing shader
4105+
* isolation on AMD GPUs. It counts the number of emitted fences for
4106+
* each GFX and compute ring. If there are any fences, it schedules
4107+
* the `enforce_isolation_work` to be run after a delay. If there are
4108+
* no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4109+
* runqueue.
4110+
*/
4111+
for (i = 0; i < MAX_XCP; i++) {
4112+
INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4113+
amdgpu_gfx_enforce_isolation_handler);
4114+
adev->gfx.enforce_isolation[i].adev = adev;
4115+
adev->gfx.enforce_isolation[i].xcp_id = i;
4116+
}
41014117

41024118
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
41034119

drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c

+167
Original file line numberDiff line numberDiff line change
@@ -1686,3 +1686,170 @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device *adev,
16861686
memcpy_toio(adev->gfx.cleaner_shader_cpu_ptr, cleaner_shader_ptr,
16871687
cleaner_shader_size);
16881688
}
1689+
1690+
/**
1691+
* amdgpu_gfx_kfd_sch_ctrl - Control the KFD scheduler from the KGD (Graphics Driver)
1692+
* @adev: amdgpu_device pointer
1693+
* @idx: Index of the scheduler to control
1694+
* @enable: Whether to enable or disable the KFD scheduler
1695+
*
1696+
* This function is used to control the KFD (Kernel Fusion Driver) scheduler
1697+
* from the KGD. It is part of the cleaner shader feature. This function plays
1698+
* a key role in enforcing process isolation on the GPU.
1699+
*
1700+
* The function uses a reference count mechanism (kfd_sch_req_count) to keep
1701+
* track of the number of requests to enable the KFD scheduler. When a request
1702+
* to enable the KFD scheduler is made, the reference count is decremented.
1703+
* When the reference count reaches zero, a delayed work is scheduled to
1704+
* enforce isolation after a delay of GFX_SLICE_PERIOD.
1705+
*
1706+
* When a request to disable the KFD scheduler is made, the function first
1707+
* checks if the reference count is zero. If it is, it cancels the delayed work
1708+
* for enforcing isolation and checks if the KFD scheduler is active. If the
1709+
* KFD scheduler is active, it sends a request to stop the KFD scheduler and
1710+
* sets the KFD scheduler state to inactive. Then, it increments the reference
1711+
* count.
1712+
*
1713+
* The function is synchronized using the kfd_sch_mutex to ensure that the KFD
1714+
* scheduler state and reference count are updated atomically.
1715+
*
1716+
* Note: If the reference count is already zero when a request to enable the
1717+
* KFD scheduler is made, it means there's an imbalance bug somewhere. The
1718+
* function triggers a warning in this case.
1719+
*/
1720+
static void amdgpu_gfx_kfd_sch_ctrl(struct amdgpu_device *adev, u32 idx,
1721+
bool enable)
1722+
{
1723+
mutex_lock(&adev->gfx.kfd_sch_mutex);
1724+
1725+
if (enable) {
1726+
/* If the count is already 0, it means there's an imbalance bug somewhere.
1727+
* Note that the bug may be in a different caller than the one which triggers the
1728+
* WARN_ON_ONCE.
1729+
*/
1730+
if (WARN_ON_ONCE(adev->gfx.kfd_sch_req_count[idx] == 0)) {
1731+
dev_err(adev->dev, "Attempted to enable KFD scheduler when reference count is already zero\n");
1732+
goto unlock;
1733+
}
1734+
1735+
adev->gfx.kfd_sch_req_count[idx]--;
1736+
1737+
if (adev->gfx.kfd_sch_req_count[idx] == 0 &&
1738+
adev->gfx.kfd_sch_inactive[idx]) {
1739+
schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work,
1740+
GFX_SLICE_PERIOD);
1741+
}
1742+
} else {
1743+
if (adev->gfx.kfd_sch_req_count[idx] == 0) {
1744+
cancel_delayed_work_sync(&adev->gfx.enforce_isolation[idx].work);
1745+
if (!adev->gfx.kfd_sch_inactive[idx]) {
1746+
amdgpu_amdkfd_stop_sched(adev, idx);
1747+
adev->gfx.kfd_sch_inactive[idx] = true;
1748+
}
1749+
}
1750+
1751+
adev->gfx.kfd_sch_req_count[idx]++;
1752+
}
1753+
1754+
unlock:
1755+
mutex_unlock(&adev->gfx.kfd_sch_mutex);
1756+
}
1757+
1758+
/**
1759+
* amdgpu_gfx_enforce_isolation_handler - work handler for enforcing shader isolation
1760+
*
1761+
* @work: work_struct.
1762+
*
1763+
* This function is the work handler for enforcing shader isolation on AMD GPUs.
1764+
* It counts the number of emitted fences for each GFX and compute ring. If there
1765+
* are any fences, it schedules the `enforce_isolation_work` to be run after a
1766+
* delay of `GFX_SLICE_PERIOD`. If there are no fences, it signals the Kernel Fusion
1767+
* Driver (KFD) to resume the runqueue. The function is synchronized using the
1768+
* `enforce_isolation_mutex`.
1769+
*/
1770+
void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work)
1771+
{
1772+
struct amdgpu_isolation_work *isolation_work =
1773+
container_of(work, struct amdgpu_isolation_work, work.work);
1774+
struct amdgpu_device *adev = isolation_work->adev;
1775+
u32 i, idx, fences = 0;
1776+
1777+
if (isolation_work->xcp_id == AMDGPU_XCP_NO_PARTITION)
1778+
idx = 0;
1779+
else
1780+
idx = isolation_work->xcp_id;
1781+
1782+
if (idx >= MAX_XCP)
1783+
return;
1784+
1785+
mutex_lock(&adev->enforce_isolation_mutex);
1786+
for (i = 0; i < AMDGPU_MAX_GFX_RINGS; ++i) {
1787+
if (isolation_work->xcp_id == adev->gfx.gfx_ring[i].xcp_id)
1788+
fences += amdgpu_fence_count_emitted(&adev->gfx.gfx_ring[i]);
1789+
}
1790+
for (i = 0; i < (AMDGPU_MAX_COMPUTE_RINGS * AMDGPU_MAX_GC_INSTANCES); ++i) {
1791+
if (isolation_work->xcp_id == adev->gfx.compute_ring[i].xcp_id)
1792+
fences += amdgpu_fence_count_emitted(&adev->gfx.compute_ring[i]);
1793+
}
1794+
if (fences) {
1795+
schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work,
1796+
GFX_SLICE_PERIOD);
1797+
} else {
1798+
/* Tell KFD to resume the runqueue */
1799+
if (adev->kfd.init_complete) {
1800+
WARN_ON_ONCE(!adev->gfx.kfd_sch_inactive[idx]);
1801+
WARN_ON_ONCE(adev->gfx.kfd_sch_req_count[idx]);
1802+
amdgpu_amdkfd_start_sched(adev, idx);
1803+
adev->gfx.kfd_sch_inactive[idx] = false;
1804+
}
1805+
}
1806+
mutex_unlock(&adev->enforce_isolation_mutex);
1807+
}
1808+
1809+
void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring)
1810+
{
1811+
struct amdgpu_device *adev = ring->adev;
1812+
u32 idx;
1813+
1814+
if (!adev->gfx.enable_cleaner_shader)
1815+
return;
1816+
1817+
if (ring->xcp_id == AMDGPU_XCP_NO_PARTITION)
1818+
idx = 0;
1819+
else
1820+
idx = ring->xcp_id;
1821+
1822+
if (idx >= MAX_XCP)
1823+
return;
1824+
1825+
mutex_lock(&adev->enforce_isolation_mutex);
1826+
if (adev->enforce_isolation[idx]) {
1827+
if (adev->kfd.init_complete)
1828+
amdgpu_gfx_kfd_sch_ctrl(adev, idx, false);
1829+
}
1830+
mutex_unlock(&adev->enforce_isolation_mutex);
1831+
}
1832+
1833+
void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring)
1834+
{
1835+
struct amdgpu_device *adev = ring->adev;
1836+
u32 idx;
1837+
1838+
if (!adev->gfx.enable_cleaner_shader)
1839+
return;
1840+
1841+
if (ring->xcp_id == AMDGPU_XCP_NO_PARTITION)
1842+
idx = 0;
1843+
else
1844+
idx = ring->xcp_id;
1845+
1846+
if (idx >= MAX_XCP)
1847+
return;
1848+
1849+
mutex_lock(&adev->enforce_isolation_mutex);
1850+
if (adev->enforce_isolation[idx]) {
1851+
if (adev->kfd.init_complete)
1852+
amdgpu_gfx_kfd_sch_ctrl(adev, idx, true);
1853+
}
1854+
mutex_unlock(&adev->enforce_isolation_mutex);
1855+
}

drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h

+15
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include "soc15.h"
3535
#include "amdgpu_ras.h"
3636
#include "amdgpu_ring_mux.h"
37+
#include "amdgpu_xcp.h"
3738

3839
/* GFX current status */
3940
#define AMDGPU_GFX_NORMAL_MODE 0x00000000L
@@ -343,6 +344,12 @@ struct amdgpu_me {
343344
DECLARE_BITMAP(queue_bitmap, AMDGPU_MAX_GFX_QUEUES);
344345
};
345346

347+
struct amdgpu_isolation_work {
348+
struct amdgpu_device *adev;
349+
u32 xcp_id;
350+
struct delayed_work work;
351+
};
352+
346353
struct amdgpu_gfx {
347354
struct mutex gpu_clock_mutex;
348355
struct amdgpu_gfx_config config;
@@ -454,6 +461,11 @@ struct amdgpu_gfx {
454461
void *cleaner_shader_cpu_ptr;
455462
const void *cleaner_shader_ptr;
456463
bool enable_cleaner_shader;
464+
struct amdgpu_isolation_work enforce_isolation[MAX_XCP];
465+
/* Mutex for synchronizing KFD scheduler operations */
466+
struct mutex kfd_sch_mutex;
467+
u64 kfd_sch_req_count[MAX_XCP];
468+
bool kfd_sch_inactive[MAX_XCP];
457469
};
458470

459471
struct amdgpu_gfx_ras_reg_entry {
@@ -563,6 +575,9 @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device *adev,
563575
const void *cleaner_shader_ptr);
564576
int amdgpu_gfx_sysfs_isolation_shader_init(struct amdgpu_device *adev);
565577
void amdgpu_gfx_sysfs_isolation_shader_fini(struct amdgpu_device *adev);
578+
void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work);
579+
void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring);
580+
void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring);
566581

567582
static inline const char *amdgpu_gfx_compute_mode_desc(int mode)
568583
{

0 commit comments

Comments
 (0)