drm/amdgpu: retry init if it fails due to exclusive mode timeout (v3)
Exclusive mode has a real-time limit in practice, such as having to be done within 300ms. A timeout is easily observed when running many VFs/VMs on a single host under heavy CPU workload. If we find that init fails due to an exclusive mode timeout, try it again.
v2: - rewrite the condition for readable value.
v3: - fix typo, add comments for sleep
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: pding <Pixel.Ding@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
@@ -2303,6 +2303,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
|||||||
|
|
||||||
r = amdgpu_init(adev);
|
r = amdgpu_init(adev);
|
||||||
if (r) {
|
if (r) {
|
||||||
|
/* failed in exclusive mode due to timeout */
|
||||||
|
if (amdgpu_sriov_vf(adev) &&
|
||||||
|
!amdgpu_sriov_runtime(adev) &&
|
||||||
|
amdgpu_virt_mmio_blocked(adev) &&
|
||||||
|
!amdgpu_virt_wait_reset(adev)) {
|
||||||
|
dev_err(adev->dev, "VF exclusive mode timeout\n");
|
||||||
|
r = -EAGAIN;
|
||||||
|
goto failed;
|
||||||
|
}
|
||||||
dev_err(adev->dev, "amdgpu_init failed\n");
|
dev_err(adev->dev, "amdgpu_init failed\n");
|
||||||
amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
|
amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
|
||||||
amdgpu_fini(adev);
|
amdgpu_fini(adev);
|
||||||
@@ -2390,6 +2399,7 @@ failed:
|
|||||||
amdgpu_vf_error_trans_all(adev);
|
amdgpu_vf_error_trans_all(adev);
|
||||||
if (runtime)
|
if (runtime)
|
||||||
vga_switcheroo_fini_domain_pm_ops(adev->dev);
|
vga_switcheroo_fini_domain_pm_ops(adev->dev);
|
||||||
|
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -86,7 +86,7 @@ done_free:
|
|||||||
int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
|
int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
|
||||||
{
|
{
|
||||||
struct amdgpu_device *adev;
|
struct amdgpu_device *adev;
|
||||||
int r, acpi_status;
|
int r, acpi_status, retry = 0;
|
||||||
|
|
||||||
#ifdef CONFIG_DRM_AMDGPU_SI
|
#ifdef CONFIG_DRM_AMDGPU_SI
|
||||||
if (!amdgpu_si_support) {
|
if (!amdgpu_si_support) {
|
||||||
@@ -122,6 +122,7 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
retry_init:
|
||||||
|
|
||||||
adev = kzalloc(sizeof(struct amdgpu_device), GFP_KERNEL);
|
adev = kzalloc(sizeof(struct amdgpu_device), GFP_KERNEL);
|
||||||
if (adev == NULL) {
|
if (adev == NULL) {
|
||||||
@@ -144,7 +145,17 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
|
|||||||
* VRAM allocation
|
* VRAM allocation
|
||||||
*/
|
*/
|
||||||
r = amdgpu_device_init(adev, dev, dev->pdev, flags);
|
r = amdgpu_device_init(adev, dev, dev->pdev, flags);
|
||||||
if (r) {
|
if (r == -EAGAIN && ++retry <= 3) {
|
||||||
|
adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
|
||||||
|
adev->virt.ops = NULL;
|
||||||
|
amdgpu_device_fini(adev);
|
||||||
|
kfree(adev);
|
||||||
|
dev->dev_private = NULL;
|
||||||
|
/* Don't request EX mode too frequently which is attacking */
|
||||||
|
msleep(5000);
|
||||||
|
dev_err(&dev->pdev->dev, "retry init %d\n", retry);
|
||||||
|
goto retry_init;
|
||||||
|
} else if (r) {
|
||||||
dev_err(&dev->pdev->dev, "Fatal error during GPU init\n");
|
dev_err(&dev->pdev->dev, "Fatal error during GPU init\n");
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user