drm/amdgpu: disable ras query and inject during gpu reset
Added a flag to the RAS context to indicate whether RAS query functionality is ready. Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: John Clements <john.clements@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
committed by
Alex Deucher
parent
66399248fe
commit
61380faa4b
@@ -4168,6 +4168,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
|||||||
need_full_reset = job_signaled = false;
|
need_full_reset = job_signaled = false;
|
||||||
INIT_LIST_HEAD(&device_list);
|
INIT_LIST_HEAD(&device_list);
|
||||||
|
|
||||||
|
amdgpu_ras_set_error_query_ready(adev, false);
|
||||||
|
|
||||||
dev_info(adev->dev, "GPU %s begin!\n",
|
dev_info(adev->dev, "GPU %s begin!\n",
|
||||||
(in_ras_intr && !use_baco) ? "jobs stop":"reset");
|
(in_ras_intr && !use_baco) ? "jobs stop":"reset");
|
||||||
|
|
||||||
@@ -4224,6 +4226,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
|||||||
/* block all schedulers and reset given job's ring */
|
/* block all schedulers and reset given job's ring */
|
||||||
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
|
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
|
||||||
if (tmp_adev != adev) {
|
if (tmp_adev != adev) {
|
||||||
|
amdgpu_ras_set_error_query_ready(tmp_adev, false);
|
||||||
amdgpu_device_lock_adev(tmp_adev, false);
|
amdgpu_device_lock_adev(tmp_adev, false);
|
||||||
if (!amdgpu_sriov_vf(tmp_adev))
|
if (!amdgpu_sriov_vf(tmp_adev))
|
||||||
amdgpu_amdkfd_pre_reset(tmp_adev);
|
amdgpu_amdkfd_pre_reset(tmp_adev);
|
||||||
|
|||||||
@@ -80,6 +80,20 @@ atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
|
|||||||
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
|
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
|
||||||
uint64_t addr);
|
uint64_t addr);
|
||||||
|
|
||||||
|
void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
|
||||||
|
{
|
||||||
|
if (adev)
|
||||||
|
amdgpu_ras_get_context(adev)->error_query_ready = ready;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
|
||||||
|
{
|
||||||
|
if (adev)
|
||||||
|
return amdgpu_ras_get_context(adev)->error_query_ready;
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
|
static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
|
||||||
size_t size, loff_t *pos)
|
size_t size, loff_t *pos)
|
||||||
{
|
{
|
||||||
@@ -281,7 +295,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
|
|||||||
struct ras_debug_if data;
|
struct ras_debug_if data;
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
|
|
||||||
if (amdgpu_ras_intr_triggered()) {
|
if (!amdgpu_ras_get_error_query_ready(adev)) {
|
||||||
DRM_WARN("RAS WARN: error injection currently inaccessible\n");
|
DRM_WARN("RAS WARN: error injection currently inaccessible\n");
|
||||||
return size;
|
return size;
|
||||||
}
|
}
|
||||||
@@ -399,7 +413,7 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
|
|||||||
.head = obj->head,
|
.head = obj->head,
|
||||||
};
|
};
|
||||||
|
|
||||||
if (amdgpu_ras_intr_triggered())
|
if (!amdgpu_ras_get_error_query_ready(obj->adev))
|
||||||
return snprintf(buf, PAGE_SIZE,
|
return snprintf(buf, PAGE_SIZE,
|
||||||
"Query currently inaccessible\n");
|
"Query currently inaccessible\n");
|
||||||
|
|
||||||
@@ -1886,8 +1900,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* in resume phase, no need to create ras fs node */
|
/* in resume phase, no need to create ras fs node */
|
||||||
if (adev->in_suspend || adev->in_gpu_reset)
|
if (adev->in_suspend || adev->in_gpu_reset) {
|
||||||
|
amdgpu_ras_set_error_query_ready(adev, true);
|
||||||
return 0;
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
if (ih_info->cb) {
|
if (ih_info->cb) {
|
||||||
r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
|
r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
|
||||||
@@ -1899,6 +1915,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
|
|||||||
if (r)
|
if (r)
|
||||||
goto sysfs;
|
goto sysfs;
|
||||||
|
|
||||||
|
amdgpu_ras_set_error_query_ready(adev, true);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
cleanup:
|
cleanup:
|
||||||
amdgpu_ras_sysfs_remove(adev, ras_block);
|
amdgpu_ras_sysfs_remove(adev, ras_block);
|
||||||
|
|||||||
@@ -334,6 +334,8 @@ struct amdgpu_ras {
|
|||||||
uint32_t flags;
|
uint32_t flags;
|
||||||
bool reboot;
|
bool reboot;
|
||||||
struct amdgpu_ras_eeprom_control eeprom_control;
|
struct amdgpu_ras_eeprom_control eeprom_control;
|
||||||
|
|
||||||
|
bool error_query_ready;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ras_fs_data {
|
struct ras_fs_data {
|
||||||
@@ -629,4 +631,6 @@ static inline void amdgpu_ras_intr_cleared(void)
|
|||||||
|
|
||||||
void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
|
void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
|
||||||
|
|
||||||
|
void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
Reference in New Issue
Block a user