mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-03-22 07:27:12 +08:00
drm/amdgpu: fix fairness in enforce isolation handling
Make sure KFD gets a turn when serializing access to the GC IP. Currently non-KFD jobs can starve KFD if they submit often enough. This patch prevents that by stalling non-KFD if its time period has elapsed.

v2: fix units
v3: check enablement properly

Acked-by: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
@@ -118,7 +118,7 @@
|
||||
|
||||
#define MAX_GPU_INSTANCE 64
|
||||
|
||||
#define GFX_SLICE_PERIOD msecs_to_jiffies(250)
|
||||
#define GFX_SLICE_PERIOD_MS 250
|
||||
|
||||
struct amdgpu_gpu_instance {
|
||||
struct amdgpu_device *adev;
|
||||
|
||||
@@ -1752,7 +1752,7 @@ static void amdgpu_gfx_kfd_sch_ctrl(struct amdgpu_device *adev, u32 idx,
|
||||
if (adev->gfx.kfd_sch_req_count[idx] == 0 &&
|
||||
adev->gfx.kfd_sch_inactive[idx]) {
|
||||
schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work,
|
||||
GFX_SLICE_PERIOD);
|
||||
msecs_to_jiffies(adev->gfx.enforce_isolation_time[idx]));
|
||||
}
|
||||
} else {
|
||||
if (adev->gfx.kfd_sch_req_count[idx] == 0) {
|
||||
@@ -1807,8 +1807,9 @@ void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work)
|
||||
fences += amdgpu_fence_count_emitted(&adev->gfx.compute_ring[i]);
|
||||
}
|
||||
if (fences) {
|
||||
/* we've already had our timeslice, so let's wrap this up */
|
||||
schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work,
|
||||
GFX_SLICE_PERIOD);
|
||||
msecs_to_jiffies(1));
|
||||
} else {
|
||||
/* Tell KFD to resume the runqueue */
|
||||
if (adev->kfd.init_complete) {
|
||||
@@ -1821,6 +1822,51 @@ void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work)
|
||||
mutex_unlock(&adev->enforce_isolation_mutex);
|
||||
}
|
||||
|
||||
static void
|
||||
amdgpu_gfx_enforce_isolation_wait_for_kfd(struct amdgpu_device *adev,
|
||||
u32 idx)
|
||||
{
|
||||
unsigned long cjiffies;
|
||||
bool wait = false;
|
||||
|
||||
mutex_lock(&adev->enforce_isolation_mutex);
|
||||
if (adev->enforce_isolation[idx]) {
|
||||
/* set the initial values if nothing is set */
|
||||
if (!adev->gfx.enforce_isolation_jiffies[idx]) {
|
||||
adev->gfx.enforce_isolation_jiffies[idx] = jiffies;
|
||||
adev->gfx.enforce_isolation_time[idx] = GFX_SLICE_PERIOD_MS;
|
||||
}
|
||||
/* Make sure KFD gets a chance to run */
|
||||
if (amdgpu_amdkfd_compute_active(adev, idx)) {
|
||||
cjiffies = jiffies;
|
||||
if (time_after(cjiffies, adev->gfx.enforce_isolation_jiffies[idx])) {
|
||||
cjiffies -= adev->gfx.enforce_isolation_jiffies[idx];
|
||||
if ((jiffies_to_msecs(cjiffies) >= GFX_SLICE_PERIOD_MS)) {
|
||||
/* if our time is up, let KGD work drain before scheduling more */
|
||||
wait = true;
|
||||
/* reset the timer period */
|
||||
adev->gfx.enforce_isolation_time[idx] = GFX_SLICE_PERIOD_MS;
|
||||
} else {
|
||||
/* set the timer period to what's left in our time slice */
|
||||
adev->gfx.enforce_isolation_time[idx] =
|
||||
GFX_SLICE_PERIOD_MS - jiffies_to_msecs(cjiffies);
|
||||
}
|
||||
} else {
|
||||
/* if jiffies wrap around we will just wait a little longer */
|
||||
adev->gfx.enforce_isolation_jiffies[idx] = jiffies;
|
||||
}
|
||||
} else {
|
||||
/* if there is no KFD work, then set the full slice period */
|
||||
adev->gfx.enforce_isolation_jiffies[idx] = jiffies;
|
||||
adev->gfx.enforce_isolation_time[idx] = GFX_SLICE_PERIOD_MS;
|
||||
}
|
||||
}
|
||||
mutex_unlock(&adev->enforce_isolation_mutex);
|
||||
|
||||
if (wait)
|
||||
msleep(GFX_SLICE_PERIOD_MS);
|
||||
}
|
||||
|
||||
void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring)
|
||||
{
|
||||
struct amdgpu_device *adev = ring->adev;
|
||||
@@ -1837,6 +1883,9 @@ void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring)
|
||||
if (idx >= MAX_XCP)
|
||||
return;
|
||||
|
||||
/* Don't submit more work until KFD has had some time */
|
||||
amdgpu_gfx_enforce_isolation_wait_for_kfd(adev, idx);
|
||||
|
||||
mutex_lock(&adev->enforce_isolation_mutex);
|
||||
if (adev->enforce_isolation[idx]) {
|
||||
if (adev->kfd.init_complete)
|
||||
|
||||
@@ -472,6 +472,8 @@ struct amdgpu_gfx {
|
||||
struct mutex kfd_sch_mutex;
|
||||
u64 kfd_sch_req_count[MAX_XCP];
|
||||
bool kfd_sch_inactive[MAX_XCP];
|
||||
unsigned long enforce_isolation_jiffies[MAX_XCP];
|
||||
unsigned long enforce_isolation_time[MAX_XCP];
|
||||
};
|
||||
|
||||
struct amdgpu_gfx_ras_reg_entry {
|
||||
|
||||
Reference in New Issue
Block a user