drm/amdgpu: add gang submit frontend v6

Allows submitting jobs as gang which needs to run on multiple engines at the
same time.

All members of the gang get the same implicit, explicit and VM dependencies. So
no gang member will start running until everything else is ready.

The last job is considered the gang leader (usually a submission to the GFX
ring) and used for signaling output dependencies.

Each job is remembered individually as user of a buffer object, so there is no
joining of work at the end.

v2: rebase and fix review comments from Andrey and Yogesh
v3: use READ instead of BOOKKEEP for now because of VM unmaps, set gang
    leader only when necessary
v4: fix order of pushing jobs and adding fences found by Trigger.
v5: fix job index calculation and adding IBs to jobs
v6: fix typo found by Alex

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Christian König 2022-03-02 16:39:34 +01:00 committed by Alex Deucher
parent 68ce8b2422
commit 4624459c84
5 changed files with 195 additions and 99 deletions

View File

@ -686,6 +686,7 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev,
ib->length_dw = ib_len;
/* This works for NO_HWS. TODO: need to handle without knowing VMID */
job->vmid = vmid;
job->num_ibs = 1;
ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);

View File

@ -64,11 +64,11 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p,
return 0;
}
static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
struct drm_amdgpu_cs_chunk_ib *chunk_ib,
unsigned int *num_ibs)
static int amdgpu_cs_job_idx(struct amdgpu_cs_parser *p,
struct drm_amdgpu_cs_chunk_ib *chunk_ib)
{
struct drm_sched_entity *entity;
unsigned int i;
int r;
r = amdgpu_ctx_get_entity(p->ctx, chunk_ib->ip_type,
@ -77,17 +77,38 @@ static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
if (r)
return r;
/* Abort if there is no run queue associated with this entity.
* Possibly because of disabled HW IP*/
/*
* Abort if there is no run queue associated with this entity.
* Possibly because of disabled HW IP.
*/
if (entity->rq == NULL)
return -EINVAL;
/* Currently we don't support submitting to multiple entities */
if (p->entity && p->entity != entity)
/* Check if we can add this IB to some existing job */
for (i = 0; i < p->gang_size; ++i)
if (p->entities[i] == entity)
return i;
/* If not increase the gang size if possible */
if (i == AMDGPU_CS_GANG_SIZE)
return -EINVAL;
p->entity = entity;
++(*num_ibs);
p->entities[i] = entity;
p->gang_size = i + 1;
return i;
}
static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
struct drm_amdgpu_cs_chunk_ib *chunk_ib,
unsigned int *num_ibs)
{
int r;
r = amdgpu_cs_job_idx(p, chunk_ib);
if (r < 0)
return r;
++(num_ibs[r]);
return 0;
}
@ -161,11 +182,12 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
union drm_amdgpu_cs *cs)
{
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
unsigned int num_ibs[AMDGPU_CS_GANG_SIZE] = { };
struct amdgpu_vm *vm = &fpriv->vm;
uint64_t *chunk_array_user;
uint64_t *chunk_array;
unsigned size, num_ibs = 0;
uint32_t uf_offset = 0;
unsigned int size;
int ret;
int i;
@ -228,7 +250,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
if (size < sizeof(struct drm_amdgpu_cs_chunk_ib))
goto free_partial_kdata;
ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, &num_ibs);
ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, num_ibs);
if (ret)
goto free_partial_kdata;
break;
@ -265,21 +287,28 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
}
}
ret = amdgpu_job_alloc(p->adev, num_ibs, &p->job, vm);
if (ret)
goto free_all_kdata;
if (!p->gang_size)
return -EINVAL;
ret = drm_sched_job_init(&p->job->base, p->entity, &fpriv->vm);
if (ret)
goto free_all_kdata;
for (i = 0; i < p->gang_size; ++i) {
ret = amdgpu_job_alloc(p->adev, num_ibs[i], &p->jobs[i], vm);
if (ret)
goto free_all_kdata;
if (p->ctx->vram_lost_counter != p->job->vram_lost_counter) {
ret = drm_sched_job_init(&p->jobs[i]->base, p->entities[i],
&fpriv->vm);
if (ret)
goto free_all_kdata;
}
p->gang_leader = p->jobs[p->gang_size - 1];
if (p->ctx->vram_lost_counter != p->gang_leader->vram_lost_counter) {
ret = -ECANCELED;
goto free_all_kdata;
}
if (p->uf_entry.tv.bo)
p->job->uf_addr = uf_offset;
p->gang_leader->uf_addr = uf_offset;
kvfree(chunk_array);
/* Use this opportunity to fill in task info for the vm */
@ -303,17 +332,25 @@ free_chunk:
static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
struct amdgpu_cs_chunk *chunk,
unsigned int *num_ibs,
unsigned int *ce_preempt,
unsigned int *de_preempt)
{
struct drm_amdgpu_cs_chunk_ib *chunk_ib = chunk->kdata;
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
struct amdgpu_ring *ring = amdgpu_job_ring(p->job);
struct amdgpu_ib *ib = &p->job->ibs[*num_ibs];
struct amdgpu_vm *vm = &fpriv->vm;
struct amdgpu_ring *ring;
struct amdgpu_job *job;
struct amdgpu_ib *ib;
int r;
r = amdgpu_cs_job_idx(p, chunk_ib);
if (r < 0)
return r;
job = p->jobs[r];
ring = amdgpu_job_ring(job);
ib = &job->ibs[job->num_ibs++];
/* MM engine doesn't support user fences */
if (p->uf_entry.tv.bo && ring->funcs->no_user_fence)
return -EINVAL;
@ -332,7 +369,7 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
}
if (chunk_ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
p->job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT;
job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT;
r = amdgpu_ib_get(p->adev, vm, ring->funcs->parse_cs ?
chunk_ib->ib_bytes : 0,
@ -345,8 +382,6 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
ib->gpu_addr = chunk_ib->va_start;
ib->length_dw = chunk_ib->ib_bytes / 4;
ib->flags = chunk_ib->flags;
(*num_ibs)++;
return 0;
}
@ -395,7 +430,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p,
dma_fence_put(old);
}
r = amdgpu_sync_fence(&p->job->sync, fence);
r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
dma_fence_put(fence);
if (r)
return r;
@ -417,7 +452,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p,
return r;
}
r = amdgpu_sync_fence(&p->job->sync, fence);
r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
dma_fence_put(fence);
return r;
@ -540,7 +575,7 @@ static int amdgpu_cs_p2_syncobj_timeline_signal(struct amdgpu_cs_parser *p,
static int amdgpu_cs_pass2(struct amdgpu_cs_parser *p)
{
unsigned int num_ibs = 0, ce_preempt = 0, de_preempt = 0;
unsigned int ce_preempt = 0, de_preempt = 0;
int i, r;
for (i = 0; i < p->nchunks; ++i) {
@ -550,8 +585,7 @@ static int amdgpu_cs_pass2(struct amdgpu_cs_parser *p)
switch (chunk->chunk_id) {
case AMDGPU_CHUNK_ID_IB:
r = amdgpu_cs_p2_ib(p, chunk, &num_ibs,
&ce_preempt, &de_preempt);
r = amdgpu_cs_p2_ib(p, chunk, &ce_preempt, &de_preempt);
if (r)
return r;
break;
@ -822,6 +856,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
struct amdgpu_vm *vm = &fpriv->vm;
struct amdgpu_bo_list_entry *e;
struct list_head duplicates;
unsigned int i;
int r;
INIT_LIST_HEAD(&p->validated);
@ -905,16 +940,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
e->bo_va = amdgpu_vm_bo_find(vm, bo);
}
/* Move fence waiting after getting reservation lock of
* PD root. Then there is no need on a ctx mutex lock.
*/
r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity);
if (unlikely(r != 0)) {
if (r != -ERESTARTSYS)
DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
goto error_validate;
}
amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
&p->bytes_moved_vis_threshold);
p->bytes_moved = 0;
@ -942,13 +967,16 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
if (r)
goto error_validate;
p->job->uf_addr += amdgpu_bo_gpu_offset(uf);
p->gang_leader->uf_addr += amdgpu_bo_gpu_offset(uf);
}
amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
p->bytes_moved_vis);
amdgpu_job_set_resources(p->job, p->bo_list->gds_obj,
p->bo_list->gws_obj, p->bo_list->oa_obj);
for (i = 0; i < p->gang_size; ++i)
amdgpu_job_set_resources(p->jobs[i], p->bo_list->gds_obj,
p->bo_list->gws_obj,
p->bo_list->oa_obj);
return 0;
error_validate:
@ -967,20 +995,24 @@ out_free_user_pages:
return r;
}
static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *parser)
static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *p)
{
int i;
int i, j;
if (!trace_amdgpu_cs_enabled())
return;
for (i = 0; i < parser->job->num_ibs; i++)
trace_amdgpu_cs(parser, i);
for (i = 0; i < p->gang_size; ++i) {
struct amdgpu_job *job = p->jobs[i];
for (j = 0; j < job->num_ibs; ++j)
trace_amdgpu_cs(p, job, &job->ibs[j]);
}
}
static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p)
static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p,
struct amdgpu_job *job)
{
struct amdgpu_job *job = p->job;
struct amdgpu_ring *ring = amdgpu_job_ring(job);
unsigned int i;
int r;
@ -1021,12 +1053,12 @@ static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p)
memcpy(ib->ptr, kptr, ib->length_dw * 4);
amdgpu_bo_kunmap(aobj);
r = amdgpu_ring_parse_cs(ring, p, p->job, ib);
r = amdgpu_ring_parse_cs(ring, p, job, ib);
if (r)
return r;
} else {
ib->ptr = (uint32_t *)kptr;
r = amdgpu_ring_patch_cs_in_place(ring, p, p->job, ib);
r = amdgpu_ring_patch_cs_in_place(ring, p, job, ib);
amdgpu_bo_kunmap(aobj);
if (r)
return r;
@ -1036,19 +1068,31 @@ static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p)
return 0;
}
static int amdgpu_cs_patch_jobs(struct amdgpu_cs_parser *p)
{
unsigned int i;
int r;
for (i = 0; i < p->gang_size; ++i) {
r = amdgpu_cs_patch_ibs(p, p->jobs[i]);
if (r)
return r;
}
return 0;
}
static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
{
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
struct amdgpu_job *job = p->gang_leader;
struct amdgpu_device *adev = p->adev;
struct amdgpu_vm *vm = &fpriv->vm;
struct amdgpu_bo_list_entry *e;
struct amdgpu_bo_va *bo_va;
struct amdgpu_bo *bo;
unsigned int i;
int r;
if (!p->job->vm)
return 0;
r = amdgpu_vm_clear_freed(adev, vm, NULL);
if (r)
return r;
@ -1057,7 +1101,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
if (r)
return r;
r = amdgpu_sync_fence(&p->job->sync, fpriv->prt_va->last_pt_update);
r = amdgpu_sync_fence(&job->sync, fpriv->prt_va->last_pt_update);
if (r)
return r;
@ -1068,7 +1112,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
if (r)
return r;
r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
if (r)
return r;
}
@ -1087,7 +1131,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
if (r)
return r;
r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
if (r)
return r;
}
@ -1100,11 +1144,18 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
if (r)
return r;
r = amdgpu_sync_fence(&p->job->sync, vm->last_update);
r = amdgpu_sync_fence(&job->sync, vm->last_update);
if (r)
return r;
p->job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo);
for (i = 0; i < p->gang_size; ++i) {
job = p->jobs[i];
if (!job->vm)
continue;
job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo);
}
if (amdgpu_vm_debug) {
/* Invalidate all BOs to test for userspace bugs */
@ -1125,7 +1176,9 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
{
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
struct amdgpu_job *leader = p->gang_leader;
struct amdgpu_bo_list_entry *e;
unsigned int i;
int r;
list_for_each_entry(e, &p->validated, tv.head) {
@ -1135,12 +1188,23 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
sync_mode = amdgpu_bo_explicit_sync(bo) ?
AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER;
r = amdgpu_sync_resv(p->adev, &p->job->sync, resv, sync_mode,
r = amdgpu_sync_resv(p->adev, &leader->sync, resv, sync_mode,
&fpriv->vm);
if (r)
return r;
}
return 0;
for (i = 0; i < p->gang_size - 1; ++i) {
r = amdgpu_sync_clone(&leader->sync, &p->jobs[i]->sync);
if (r)
return r;
}
r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entities[p->gang_size - 1]);
if (r && r != -ERESTARTSYS)
DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
return r;
}
static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p)
@ -1164,16 +1228,28 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
union drm_amdgpu_cs *cs)
{
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
struct drm_sched_entity *entity = p->entity;
struct amdgpu_job *leader = p->gang_leader;
struct amdgpu_bo_list_entry *e;
struct amdgpu_job *job;
unsigned int i;
uint64_t seq;
int r;
job = p->job;
p->job = NULL;
for (i = 0; i < p->gang_size; ++i)
drm_sched_job_arm(&p->jobs[i]->base);
drm_sched_job_arm(&job->base);
for (i = 0; i < (p->gang_size - 1); ++i) {
struct dma_fence *fence;
fence = &p->jobs[i]->base.s_fence->scheduled;
r = amdgpu_sync_fence(&leader->sync, fence);
if (r)
goto error_cleanup;
}
if (p->gang_size > 1) {
for (i = 0; i < p->gang_size; ++i)
amdgpu_job_set_gang_leader(p->jobs[i], leader);
}
/* No memory allocation is allowed while holding the notifier lock.
* The lock is held until amdgpu_cs_submit is finished and fence is
@ -1191,45 +1267,57 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
}
if (r) {
r = -EAGAIN;
goto error_abort;
goto error_unlock;
}
p->fence = dma_fence_get(&job->base.s_fence->finished);
p->fence = dma_fence_get(&leader->base.s_fence->finished);
list_for_each_entry(e, &p->validated, tv.head) {
seq = amdgpu_ctx_add_fence(p->ctx, entity, p->fence);
/* Everybody except for the gang leader uses READ */
for (i = 0; i < (p->gang_size - 1); ++i) {
dma_resv_add_fence(e->tv.bo->base.resv,
&p->jobs[i]->base.s_fence->finished,
DMA_RESV_USAGE_READ);
}
/* The gang leader is remembered as writer */
e->tv.num_shared = 0;
}
seq = amdgpu_ctx_add_fence(p->ctx, p->entities[p->gang_size - 1],
p->fence);
amdgpu_cs_post_dependencies(p);
if ((job->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) &&
if ((leader->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) &&
!p->ctx->preamble_presented) {
job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST;
leader->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST;
p->ctx->preamble_presented = true;
}
cs->out.handle = seq;
job->uf_sequence = seq;
leader->uf_sequence = seq;
amdgpu_job_free_resources(job);
trace_amdgpu_cs_ioctl(job);
amdgpu_vm_bo_trace_cs(&fpriv->vm, &p->ticket);
drm_sched_entity_push_job(&job->base);
for (i = 0; i < p->gang_size; ++i) {
amdgpu_job_free_resources(p->jobs[i]);
trace_amdgpu_cs_ioctl(p->jobs[i]);
drm_sched_entity_push_job(&p->jobs[i]->base);
p->jobs[i] = NULL;
}
amdgpu_vm_move_to_lru_tail(p->adev, &fpriv->vm);
/* Make sure all BOs are remembered as writers */
amdgpu_bo_list_for_each_entry(e, p->bo_list)
e->tv.num_shared = 0;
ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence);
mutex_unlock(&p->adev->notifier_lock);
mutex_unlock(&p->bo_list->bo_list_mutex);
return 0;
error_abort:
drm_sched_job_cleanup(&job->base);
error_unlock:
mutex_unlock(&p->adev->notifier_lock);
amdgpu_job_free(job);
error_cleanup:
for (i = 0; i < p->gang_size; ++i)
drm_sched_job_cleanup(&p->jobs[i]->base);
return r;
}
@ -1246,17 +1334,18 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser)
dma_fence_put(parser->fence);
if (parser->ctx) {
if (parser->ctx)
amdgpu_ctx_put(parser->ctx);
}
if (parser->bo_list)
amdgpu_bo_list_put(parser->bo_list);
for (i = 0; i < parser->nchunks; i++)
kvfree(parser->chunks[i].kdata);
kvfree(parser->chunks);
if (parser->job)
amdgpu_job_free(parser->job);
for (i = 0; i < parser->gang_size; ++i) {
if (parser->jobs[i])
amdgpu_job_free(parser->jobs[i]);
}
if (parser->uf_entry.tv.bo) {
struct amdgpu_bo *uf = ttm_to_amdgpu_bo(parser->uf_entry.tv.bo);
@ -1300,7 +1389,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
goto error_fini;
}
r = amdgpu_cs_patch_ibs(&parser);
r = amdgpu_cs_patch_jobs(&parser);
if (r)
goto error_backoff;

View File

@ -27,6 +27,8 @@
#include "amdgpu_bo_list.h"
#include "amdgpu_ring.h"
#define AMDGPU_CS_GANG_SIZE 4
struct amdgpu_bo_va_mapping;
struct amdgpu_cs_chunk {
@ -50,9 +52,11 @@ struct amdgpu_cs_parser {
unsigned nchunks;
struct amdgpu_cs_chunk *chunks;
/* scheduler job object */
struct amdgpu_job *job;
struct drm_sched_entity *entity;
/* scheduler job objects */
unsigned int gang_size;
struct drm_sched_entity *entities[AMDGPU_CS_GANG_SIZE];
struct amdgpu_job *jobs[AMDGPU_CS_GANG_SIZE];
struct amdgpu_job *gang_leader;
/* buffer objects */
struct ww_acquire_ctx ticket;

View File

@ -105,7 +105,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
*/
(*job)->base.sched = &adev->rings[0]->sched;
(*job)->vm = vm;
(*job)->num_ibs = num_ibs;
amdgpu_sync_create(&(*job)->sync);
amdgpu_sync_create(&(*job)->sched_sync);
@ -125,6 +124,7 @@ int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
if (r)
return r;
(*job)->num_ibs = 1;
r = amdgpu_ib_get(adev, NULL, size, pool_type, &(*job)->ibs[0]);
if (r)
kfree(*job);

View File

@ -140,8 +140,10 @@ TRACE_EVENT(amdgpu_bo_create,
);
TRACE_EVENT(amdgpu_cs,
TP_PROTO(struct amdgpu_cs_parser *p, int i),
TP_ARGS(p, i),
TP_PROTO(struct amdgpu_cs_parser *p,
struct amdgpu_job *job,
struct amdgpu_ib *ib),
TP_ARGS(p, job, ib),
TP_STRUCT__entry(
__field(struct amdgpu_bo_list *, bo_list)
__field(u32, ring)
@ -151,10 +153,10 @@ TRACE_EVENT(amdgpu_cs,
TP_fast_assign(
__entry->bo_list = p->bo_list;
__entry->ring = to_amdgpu_ring(p->entity->rq->sched)->idx;
__entry->dw = p->job->ibs[i].length_dw;
__entry->ring = to_amdgpu_ring(job->base.sched)->idx;
__entry->dw = ib->length_dw;
__entry->fences = amdgpu_fence_count_emitted(
to_amdgpu_ring(p->entity->rq->sched));
to_amdgpu_ring(job->base.sched));
),
TP_printk("bo_list=%p, ring=%u, dw=%u, fences=%u",
__entry->bo_list, __entry->ring, __entry->dw,