drm/amdkfd: APIs to stop/start KFD scheduling

Provide amdgpu_amdkfd_stop_sched() for amdgpu to stop KFD scheduling
compute work through the HIQ; amdgpu_amdkfd_start_sched() resumes the
scheduling. When amdgpu_amdkfd_stop_sched() is called, KFD unmaps all
queues from the runlist. Queues that user ioctls create while scheduling
is stopped are still added, but they are not mapped to the runlist (and
therefore not scheduled) until amdgpu_amdkfd_start_sched() is called.
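
For illustration only: a sketch of how an amdgpu-side caller might pair
the two entry points (the function below and its name are hypothetical,
not part of this patch):

/* Hypothetical caller: quiesce KFD compute scheduling around work that
 * must not race with HWS dispatch. Both helpers return 0 when KFD never
 * completed init, so no extra readiness check is needed here.
 */
static int example_quiesce_kfd(struct amdgpu_device *adev, uint32_t node_id)
{
	int r;

	r = amdgpu_amdkfd_stop_sched(adev, node_id); /* unmap queues from runlist */
	if (r)
		return r;

	/* ... work that requires compute queues to be unscheduled ... */

	return amdgpu_amdkfd_start_sched(adev, node_id); /* map queues back */
}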

v2: fix build (Alex)

Signed-off-by: Amber Lin <Amber.Lin@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c

@@ -887,3 +887,21 @@ free_ring_funcs:
 
 	return r;
 }
+
+/* Stop scheduling on KFD */
+int amdgpu_amdkfd_stop_sched(struct amdgpu_device *adev, uint32_t node_id)
+{
+	if (!adev->kfd.init_complete)
+		return 0;
+
+	return kgd2kfd_stop_sched(adev->kfd.dev, node_id);
+}
+
+/* Start scheduling on KFD */
+int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id)
+{
+	if (!adev->kfd.init_complete)
+		return 0;
+
+	return kgd2kfd_start_sched(adev->kfd.dev, node_id);
+}

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h

@@ -264,6 +264,8 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
 				uint32_t *payload);
 int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
 				u32 inst);
+int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id);
+int amdgpu_amdkfd_stop_sched(struct amdgpu_device *adev, uint32_t node_id);
 
 /* Read user wptr from a specified user address space with page fault
  * disabled. The memory must be pinned and mapped to the hardware when
@@ -426,6 +428,8 @@ void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
 void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask);
 int kgd2kfd_check_and_lock_kfd(void);
 void kgd2kfd_unlock_kfd(void);
+int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id);
+int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id);
 #else
 static inline int kgd2kfd_init(void)
 {
@@ -496,5 +500,15 @@ static inline int kgd2kfd_check_and_lock_kfd(void)
 static inline void kgd2kfd_unlock_kfd(void)
 {
 }
+
+static inline int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id)
+{
+	return 0;
+}
+
+static inline int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
+{
+	return 0;
+}
 #endif
 
 #endif /* AMDGPU_AMDKFD_H_INCLUDED */

drivers/gpu/drm/amd/amdkfd/kfd_device.c

@@ -1446,6 +1446,45 @@ void kgd2kfd_unlock_kfd(void)
 	mutex_unlock(&kfd_processes_mutex);
 }
 
+int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id)
+{
+	struct kfd_node *node;
+	int ret;
+
+	if (!kfd->init_complete)
+		return 0;
+
+	if (node_id >= kfd->num_nodes) {
+		dev_warn(kfd->adev->dev, "Invalid node ID: %u exceeds %u\n",
+			 node_id, kfd->num_nodes - 1);
+		return -EINVAL;
+	}
+
+	node = kfd->nodes[node_id];
+
+	ret = node->dqm->ops.unhalt(node->dqm);
+	if (ret)
+		dev_err(kfd_device, "Error in starting scheduler\n");
+
+	return ret;
+}
+
+int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
+{
+	struct kfd_node *node;
+
+	if (!kfd->init_complete)
+		return 0;
+
+	if (node_id >= kfd->num_nodes) {
+		dev_warn(kfd->adev->dev, "Invalid node ID: %u exceeds %u\n",
+			 node_id, kfd->num_nodes - 1);
+		return -EINVAL;
+	}
+
+	node = kfd->nodes[node_id];
+	return node->dqm->ops.halt(node->dqm);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 /* This function will send a package to HIQ to hang the HWS
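
A hypothetical sketch (the helper name is mine, not from this patch) of
how a caller could halt scheduling on every node of a multi-partition
device, relying on kgd2kfd_stop_sched()'s own node_id validation:

/* Stop scheduling on all KFD nodes (partitions) of a device. Unwinding
 * the nodes that were already halted on error is omitted for brevity.
 */
static int example_stop_sched_all_nodes(struct kfd_dev *kfd)
{
	uint32_t i;
	int r;

	for (i = 0; i < kfd->num_nodes; i++) {
		r = kgd2kfd_stop_sched(kfd, i);
		if (r)
			return r;
	}
	return 0;
}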

drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

@@ -1679,6 +1679,60 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
 	return 0;
 }
 
+/* halt_cpsch:
+ * Unmap queues so the scheduler doesn't continue the remaining jobs in the
+ * queues. Then set dqm->sched_halt so queues don't map to the runlist until
+ * unhalt_cpsch is called.
+ */
+static int halt_cpsch(struct device_queue_manager *dqm)
+{
+	int ret = 0;
+
+	dqm_lock(dqm);
+	if (!dqm->sched_running) {
+		dqm_unlock(dqm);
+		return 0;
+	}
+
+	WARN_ONCE(dqm->sched_halt, "Scheduling is already on halt\n");
+
+	if (!dqm->is_hws_hang) {
+		if (!dqm->dev->kfd->shared_resources.enable_mes)
+			ret = unmap_queues_cpsch(dqm,
+						 KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
+						 USE_DEFAULT_GRACE_PERIOD, false);
+		else
+			ret = remove_all_queues_mes(dqm);
+	}
+	dqm->sched_halt = true;
+	dqm_unlock(dqm);
+
+	return ret;
+}
+
+/* unhalt_cpsch:
+ * Unset dqm->sched_halt and map the queues back to the runlist.
+ */
+static int unhalt_cpsch(struct device_queue_manager *dqm)
+{
+	int ret = 0;
+
+	dqm_lock(dqm);
+	if (!dqm->sched_running || !dqm->sched_halt) {
+		WARN_ONCE(!dqm->sched_halt, "Scheduling is not on halt.\n");
+		dqm_unlock(dqm);
+		return 0;
+	}
+	dqm->sched_halt = false;
+	if (!dqm->dev->kfd->shared_resources.enable_mes)
+		ret = execute_queues_cpsch(dqm,
+					   KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES,
+					   0, USE_DEFAULT_GRACE_PERIOD);
+	dqm_unlock(dqm);
+
+	return ret;
+}
+
 static int start_cpsch(struct device_queue_manager *dqm)
 {
 	struct device *dev = dqm->dev->adev->dev;
@@ -1984,7 +2038,7 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
 	struct device *dev = dqm->dev->adev->dev;
 	int retval;
 
-	if (!dqm->sched_running)
+	if (!dqm->sched_running || dqm->sched_halt)
 		return 0;
 	if (dqm->active_queue_count <= 0 || dqm->processes_count <= 0)
 		return 0;
@@ -2727,6 +2781,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev)
 		dqm->ops.initialize = initialize_cpsch;
 		dqm->ops.start = start_cpsch;
 		dqm->ops.stop = stop_cpsch;
+		dqm->ops.halt = halt_cpsch;
+		dqm->ops.unhalt = unhalt_cpsch;
 		dqm->ops.destroy_queue = destroy_queue_cpsch;
 		dqm->ops.update_queue = update_queue;
 		dqm->ops.register_process = register_process;
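
The flag interaction is small enough to model in isolation. A
self-contained user-space sketch (toy types, not the kernel's) of the
gate that map_queues_cpsch() now applies, showing why queues created
while halted stay off the runlist until unhalt:

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for the two dqm flags this patch cares about. */
struct toy_dqm {
	bool sched_running;
	bool sched_halt;
};

/* Mirrors the updated early-out in map_queues_cpsch(): mapping is a
 * no-op unless the scheduler is running and not halted.
 */
static bool toy_map_queues_allowed(const struct toy_dqm *dqm)
{
	return dqm->sched_running && !dqm->sched_halt;
}

int main(void)
{
	struct toy_dqm dqm = { .sched_running = true, .sched_halt = false };

	printf("running:  map allowed = %d\n", toy_map_queues_allowed(&dqm));

	dqm.sched_halt = true;		/* what halt_cpsch() sets */
	printf("halted:   map allowed = %d\n", toy_map_queues_allowed(&dqm));

	dqm.sched_halt = false;		/* what unhalt_cpsch() clears */
	printf("unhalted: map allowed = %d\n", toy_map_queues_allowed(&dqm));
	return 0;
}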

drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h

@@ -106,6 +106,12 @@ union GRBM_GFX_INDEX_BITS {
  * @uninitialize: Destroys all the device queue manager resources allocated in
  * initialize routine.
  *
+ * @halt: This routine unmaps queues from the runlist and sets the halt status
+ * to true so no more queues are mapped to the runlist until unhalt.
+ *
+ * @unhalt: This routine sets the halt status to false and maps queues back to
+ * the runlist.
+ *
  * @create_kernel_queue: Creates kernel queue. Used for debug queue.
  *
  * @destroy_kernel_queue: Destroys kernel queue. Used for debug queue.
@@ -153,6 +159,8 @@ struct device_queue_manager_ops {
 	int	(*start)(struct device_queue_manager *dqm);
 	int	(*stop)(struct device_queue_manager *dqm);
 	void	(*uninitialize)(struct device_queue_manager *dqm);
+	int	(*halt)(struct device_queue_manager *dqm);
+	int	(*unhalt)(struct device_queue_manager *dqm);
 	int	(*create_kernel_queue)(struct device_queue_manager *dqm,
 				       struct kernel_queue *kq,
 				       struct qcm_process_device *qpd);
@@ -264,6 +272,7 @@ struct device_queue_manager {
 	struct work_struct	hw_exception_work;
 	struct kfd_mem_obj	hiq_sdma_mqd;
 	bool			sched_running;
+	bool			sched_halt;
 
 	/* used for GFX 9.4.3 only */
 	uint32_t		current_logical_xcc_start;