drm/amdgpu/gfx9.4.3: Implement compute pipe reset

Implement the compute pipe reset; the driver will
fall back to pipe reset when queue reset fails.
The pipe reset only deactivates the queue which is
scheduled in the pipe, and meanwhile the MEC pipe
will be reset to the firmware _start pointer. So,
it seems pipe reset will cost more cycles than the
queue reset; therefore, the driver tries to recover
by doing queue reset first.

Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Prike Liang <Prike.Liang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Prike Liang 2024-08-29 11:47:12 +08:00 committed by Alex Deucher
parent 6c0a7c3c69
commit ad17b124c3

View File

@ -3469,6 +3469,98 @@ static void gfx_v9_4_3_emit_wave_limit(struct amdgpu_ring *ring, bool enable)
}
}
/*
 * Poll CP_HQD_ACTIVE for the selected me/pipe/queue until bit 0 clears.
 *
 * The register is sampled up to adev->usec_timeout times with a 1us delay
 * between samples. The poll runs between the xcc set/unset safe mode calls
 * and with srbm_mutex held while the GRBM selection points at the queue;
 * the selection is put back to 0/0/0/0 before the mutex is released.
 *
 * Returns 0 once bit 0 of CP_HQD_ACTIVE reads as clear, or -ETIMEDOUT if
 * it never does within the polling budget.
 */
static int gfx_v9_4_3_unmap_done(struct amdgpu_device *adev, uint32_t me,
				 uint32_t pipe, uint32_t queue,
				 uint32_t xcc_id)
{
	/* Assume failure; only a sample with the active bit clear flips it. */
	int ret = -ETIMEDOUT;
	int usec;

	gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
	mutex_lock(&adev->srbm_mutex);
	soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));

	/* make sure dequeue is complete */
	for (usec = 0; usec < adev->usec_timeout; usec++) {
		if ((RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_ACTIVE) & 1) == 0) {
			ret = 0;
			break;
		}
		udelay(1);
	}

	soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
	mutex_unlock(&adev->srbm_mutex);
	gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);

	return ret;
}
/*
 * Report whether the compute pipe reset path may be used on this device.
 *
 * Returns true only when the GC IP version is 9.4.3 and the MEC firmware
 * version is at least 0x9b. In every other case a warning is logged through
 * dev_warn_once() and false is returned.
 */
static bool gfx_v9_4_3_pipe_reset_support(struct amdgpu_device *adev)
{
	/* TODO: Need check gfx9.4.4 mec fw whether supports pipe reset as well. */
	if (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(9, 4, 3) ||
	    adev->gfx.mec_fw_version < 0x0000009b) {
		dev_warn_once(adev->dev, "Please use the latest MEC version to see whether support pipe reset\n");
		return false;
	}

	return true;
}
/*
 * Reset the MEC pipe that @ring belongs to by pulsing its reset bit in
 * CP_MEC_CNTL, then wait for the ring's queue to report inactive.
 *
 * CP_MEC_CNTL is read once; a copy with the pipe's reset field set to 1 is
 * written first, and the value as originally read is written back right
 * after. Both writes happen between the xcc set/unset safe mode calls and
 * with srbm_mutex held.
 *
 * Returns -EINVAL when gfx_v9_4_3_pipe_reset_support() reports no support,
 * otherwise the result of gfx_v9_4_3_unmap_done() for the ring's queue.
 */
static int gfx_v9_4_3_reset_hw_pipe(struct amdgpu_ring *ring)
{
	struct amdgpu_device *adev = ring->adev;
	uint32_t mec_cntl, mec_cntl_reset;

	if (!gfx_v9_4_3_pipe_reset_support(adev))
		return -EINVAL;

	gfx_v9_4_3_xcc_set_safe_mode(adev, ring->xcc_id);
	mutex_lock(&adev->srbm_mutex);

	/* Keep the value as read so it can be restored after the reset write. */
	mec_cntl = RREG32_SOC15(GC, GET_INST(GC, ring->xcc_id), regCP_MEC_CNTL);
	mec_cntl_reset = mec_cntl;

	if (ring->me != 1) {
		/* Every ME other than 1 uses the ME2 bits: pipe 0, or pipe 1 for any non-zero pipe. */
		mec_cntl_reset = ring->pipe ?
			REG_SET_FIELD(mec_cntl_reset, CP_MEC_CNTL,
				      MEC_ME2_PIPE1_RESET, 1) :
			REG_SET_FIELD(mec_cntl_reset, CP_MEC_CNTL,
				      MEC_ME2_PIPE0_RESET, 1);
	} else if (ring->pipe == 0) {
		mec_cntl_reset = REG_SET_FIELD(mec_cntl_reset, CP_MEC_CNTL,
					       MEC_ME1_PIPE0_RESET, 1);
	} else if (ring->pipe == 1) {
		mec_cntl_reset = REG_SET_FIELD(mec_cntl_reset, CP_MEC_CNTL,
					       MEC_ME1_PIPE1_RESET, 1);
	} else if (ring->pipe == 2) {
		mec_cntl_reset = REG_SET_FIELD(mec_cntl_reset, CP_MEC_CNTL,
					       MEC_ME1_PIPE2_RESET, 1);
	} else if (ring->pipe == 3) {
		mec_cntl_reset = REG_SET_FIELD(mec_cntl_reset, CP_MEC_CNTL,
					       MEC_ME1_PIPE3_RESET, 1);
	}
	/* An ME1 pipe outside 0-3 sets no reset bit; both writes then carry the same value. */

	WREG32_SOC15(GC, GET_INST(GC, ring->xcc_id), regCP_MEC_CNTL, mec_cntl_reset);
	WREG32_SOC15(GC, GET_INST(GC, ring->xcc_id), regCP_MEC_CNTL, mec_cntl);

	mutex_unlock(&adev->srbm_mutex);
	gfx_v9_4_3_xcc_unset_safe_mode(adev, ring->xcc_id);

	return gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe, ring->queue,
				     ring->xcc_id);
}
static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
unsigned int vmid)
{
@ -3476,7 +3568,7 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
struct amdgpu_kiq *kiq = &adev->gfx.kiq[ring->xcc_id];
struct amdgpu_ring *kiq_ring = &kiq->ring;
unsigned long flags;
int r, i;
int r;
if (!adev->debug_exp_resets)
return -EINVAL;
@ -3501,26 +3593,23 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
spin_unlock_irqrestore(&kiq->ring_lock, flags);
r = amdgpu_ring_test_ring(kiq_ring);
if (r)
return r;
/* make sure dequeue is complete*/
amdgpu_gfx_rlc_enter_safe_mode(adev, ring->xcc_id);
mutex_lock(&adev->srbm_mutex);
soc15_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0, GET_INST(GC, ring->xcc_id));
for (i = 0; i < adev->usec_timeout; i++) {
if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
break;
udelay(1);
}
if (i >= adev->usec_timeout)
r = -ETIMEDOUT;
soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, ring->xcc_id));
mutex_unlock(&adev->srbm_mutex);
amdgpu_gfx_rlc_exit_safe_mode(adev, ring->xcc_id);
if (r) {
dev_err(adev->dev, "fail to wait on hqd deactive\n");
return r;
dev_err(adev->dev, "kiq ring test failed after ring: %s queue reset\n",
ring->name);
goto pipe_reset;
}
r = gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe, ring->queue, ring->xcc_id);
if (r)
dev_err(adev->dev, "fail to wait on hqd deactive and will try pipe reset\n");
pipe_reset:
if(r) {
r = gfx_v9_4_3_reset_hw_pipe(ring);
dev_info(adev->dev, "ring: %s pipe reset :%s\n", ring->name,
r ? "failed" : "successfully");
if (r)
return r;
}
r = amdgpu_bo_reserve(ring->mqd_obj, false);