mirror of
https://github.com/torvalds/linux.git
synced 2024-09-20 06:53:04 +00:00
drm/amdkfd: Handle queue destroy buffer access race
Add a helper function, kfd_queue_unreference_buffers, that decrements the queue buffer refcounts, separating that step from releasing the queue buffers. Taking the vm lock while holding the dqm_lock would be circular locking, so kfd_ioctl_destroy_queue should take the vm lock and unreference the queue buffers first — without releasing them — so that it can recover if taking the vm lock fails. It then holds the dqm_lock to remove the queue from the queue list and releases the queue buffers afterwards. The restore-process worker, which restores queues while holding the dqm_lock, will therefore always find the queue with valid queue buffers. v2 (Felix): - renamed kfd_queue_unreference_buffer(s) to kfd_queue_unref_bo_va(s) - added two FIXME comments for follow up Signed-off-by: Philip Yang <Philip.Yang@amd.com> Signed-off-by: Felix Kuehling <felix.kuehling@amd.com> Reviewed-by: Felix Kuehling <felix.kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
70f83e7706
commit
a1fc9f584c
|
@ -400,6 +400,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
|
|||
return 0;
|
||||
|
||||
err_create_queue:
|
||||
kfd_queue_unref_bo_vas(pdd, &q_properties);
|
||||
kfd_queue_release_buffers(pdd, &q_properties);
|
||||
err_acquire_queue_buf:
|
||||
err_sdma_engine_id:
|
||||
|
|
|
@ -1298,9 +1298,12 @@ void print_queue_properties(struct queue_properties *q);
|
|||
void print_queue(struct queue *q);
|
||||
int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo,
|
||||
u64 expected_size);
|
||||
void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
|
||||
void kfd_queue_buffer_put(struct amdgpu_bo **bo);
|
||||
int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties);
|
||||
int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties);
|
||||
void kfd_queue_unref_bo_va(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
|
||||
int kfd_queue_unref_bo_vas(struct kfd_process_device *pdd,
|
||||
struct queue_properties *properties);
|
||||
void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev);
|
||||
|
||||
struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
|
||||
|
|
|
@ -217,6 +217,7 @@ void pqm_uninit(struct process_queue_manager *pqm)
|
|||
list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {
|
||||
if (pqn->q) {
|
||||
pdd = kfd_get_process_device_data(pqn->q->device, pqm->process);
|
||||
kfd_queue_unref_bo_vas(pdd, &pqn->q->properties);
|
||||
kfd_queue_release_buffers(pdd, &pqn->q->properties);
|
||||
pqm_clean_queue_resource(pqm, pqn);
|
||||
}
|
||||
|
@ -512,7 +513,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
|
|||
}
|
||||
|
||||
if (pqn->q) {
|
||||
retval = kfd_queue_release_buffers(pdd, &pqn->q->properties);
|
||||
retval = kfd_queue_unref_bo_vas(pdd, &pqn->q->properties);
|
||||
if (retval)
|
||||
goto err_destroy_queue;
|
||||
|
||||
|
@ -526,7 +527,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
|
|||
if (retval != -ETIME)
|
||||
goto err_destroy_queue;
|
||||
}
|
||||
|
||||
kfd_queue_release_buffers(pdd, &pqn->q->properties);
|
||||
pqm_clean_queue_resource(pqm, pqn);
|
||||
uninit_queue(pqn->q);
|
||||
}
|
||||
|
@ -579,7 +580,8 @@ int pqm_update_queue_properties(struct process_queue_manager *pqm,
|
|||
return -EFAULT;
|
||||
}
|
||||
|
||||
kfd_queue_buffer_put(vm, &pqn->q->properties.ring_bo);
|
||||
kfd_queue_unref_bo_va(vm, &pqn->q->properties.ring_bo);
|
||||
kfd_queue_buffer_put(&pqn->q->properties.ring_bo);
|
||||
amdgpu_bo_unreserve(vm->root.bo);
|
||||
|
||||
pqn->q->properties.ring_bo = p->ring_bo;
|
||||
|
|
|
@ -224,16 +224,9 @@ out_err:
|
|||
return -EINVAL;
|
||||
}
|
||||
|
||||
void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo)
|
||||
/* FIXME: remove this function, just call amdgpu_bo_unref directly */
|
||||
void kfd_queue_buffer_put(struct amdgpu_bo **bo)
|
||||
{
|
||||
if (*bo) {
|
||||
struct amdgpu_bo_va *bo_va;
|
||||
|
||||
bo_va = amdgpu_vm_bo_find(vm, *bo);
|
||||
if (bo_va)
|
||||
bo_va->queue_refcount--;
|
||||
}
|
||||
|
||||
amdgpu_bo_unref(bo);
|
||||
}
|
||||
|
||||
|
@ -327,6 +320,10 @@ out_unreserve:
|
|||
out_err_unreserve:
|
||||
amdgpu_bo_unreserve(vm->root.bo);
|
||||
out_err_release:
|
||||
/* FIXME: make a _locked version of this that can be called before
|
||||
* dropping the VM reservation.
|
||||
*/
|
||||
kfd_queue_unref_bo_vas(pdd, properties);
|
||||
kfd_queue_release_buffers(pdd, properties);
|
||||
return err;
|
||||
}
|
||||
|
@ -334,22 +331,13 @@ out_err_release:
|
|||
int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
|
||||
{
|
||||
struct kfd_topology_device *topo_dev;
|
||||
struct amdgpu_vm *vm;
|
||||
u32 total_cwsr_size;
|
||||
int err;
|
||||
|
||||
vm = drm_priv_to_vm(pdd->drm_priv);
|
||||
err = amdgpu_bo_reserve(vm->root.bo, false);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
kfd_queue_buffer_put(vm, &properties->wptr_bo);
|
||||
kfd_queue_buffer_put(vm, &properties->rptr_bo);
|
||||
kfd_queue_buffer_put(vm, &properties->ring_bo);
|
||||
kfd_queue_buffer_put(vm, &properties->eop_buf_bo);
|
||||
kfd_queue_buffer_put(vm, &properties->cwsr_bo);
|
||||
|
||||
amdgpu_bo_unreserve(vm->root.bo);
|
||||
kfd_queue_buffer_put(&properties->wptr_bo);
|
||||
kfd_queue_buffer_put(&properties->rptr_bo);
|
||||
kfd_queue_buffer_put(&properties->ring_bo);
|
||||
kfd_queue_buffer_put(&properties->eop_buf_bo);
|
||||
kfd_queue_buffer_put(&properties->cwsr_bo);
|
||||
|
||||
topo_dev = kfd_topology_device_by_id(pdd->dev->id);
|
||||
if (!topo_dev)
|
||||
|
@ -362,6 +350,38 @@ int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_prope
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
 * kfd_queue_unref_bo_va - drop a queue's refcount on one buffer's VA mapping
 * @vm: the process VM in which the buffer is mapped
 * @bo: pointer to the buffer object pointer; may point to NULL (no-op then)
 *
 * Decrements bo_va->queue_refcount for the VA mapping of *bo, if one exists.
 * Does NOT drop the BO reference itself; that is done separately by
 * kfd_queue_release_buffers()/kfd_queue_buffer_put().
 *
 * NOTE(review): caller is expected to hold the VM reservation — the only
 * visible caller, kfd_queue_unref_bo_vas(), reserves vm->root.bo first.
 */
void kfd_queue_unref_bo_va(struct amdgpu_vm *vm, struct amdgpu_bo **bo)
{
	if (*bo) {
		struct amdgpu_bo_va *bo_va;

		bo_va = amdgpu_vm_bo_find(vm, *bo);
		/* Guard against underflow: only decrement a non-zero count */
		if (bo_va && bo_va->queue_refcount)
			bo_va->queue_refcount--;
	}
}
|
||||
|
||||
int kfd_queue_unref_bo_vas(struct kfd_process_device *pdd,
|
||||
struct queue_properties *properties)
|
||||
{
|
||||
struct amdgpu_vm *vm;
|
||||
int err;
|
||||
|
||||
vm = drm_priv_to_vm(pdd->drm_priv);
|
||||
err = amdgpu_bo_reserve(vm->root.bo, false);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
kfd_queue_unref_bo_va(vm, &properties->wptr_bo);
|
||||
kfd_queue_unref_bo_va(vm, &properties->rptr_bo);
|
||||
kfd_queue_unref_bo_va(vm, &properties->ring_bo);
|
||||
kfd_queue_unref_bo_va(vm, &properties->eop_buf_bo);
|
||||
kfd_queue_unref_bo_va(vm, &properties->cwsr_bo);
|
||||
|
||||
amdgpu_bo_unreserve(vm->root.bo);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define SGPR_SIZE_PER_CU 0x4000
|
||||
#define LDS_SIZE_PER_CU 0x10000
|
||||
#define HWREG_SIZE_PER_CU 0x1000
|
||||
|
|
Loading…
Reference in New Issue
Block a user