drm/amdgpu: add graceful VM fault handling v3
Next step towards HMM support. For now just silence the retry fault and optionally redirect the request to the dummy page. v2: make sure the VM is not destroyed while we handle the fault. v3: fix VM destroy check, cleanup comments Signed-off-by: Christian König <christian.koenig@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
committed by
Alex Deucher
parent
b65709a921
commit
ec671737f8
@@ -3126,3 +3126,76 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* amdgpu_vm_handle_fault - graceful handling of VM faults.
|
||||||
|
* @adev: amdgpu device pointer
|
||||||
|
* @pasid: PASID of the VM
|
||||||
|
* @addr: Address of the fault
|
||||||
|
*
|
||||||
|
* Try to gracefully handle a VM fault. Return true if the fault was handled and
|
||||||
|
* shouldn't be reported any more.
|
||||||
|
*/
|
||||||
|
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, unsigned int pasid,
|
||||||
|
uint64_t addr)
|
||||||
|
{
|
||||||
|
struct amdgpu_bo *root;
|
||||||
|
uint64_t value, flags;
|
||||||
|
struct amdgpu_vm *vm;
|
||||||
|
long r;
|
||||||
|
|
||||||
|
spin_lock(&adev->vm_manager.pasid_lock);
|
||||||
|
vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
|
||||||
|
if (vm)
|
||||||
|
root = amdgpu_bo_ref(vm->root.base.bo);
|
||||||
|
else
|
||||||
|
root = NULL;
|
||||||
|
spin_unlock(&adev->vm_manager.pasid_lock);
|
||||||
|
|
||||||
|
if (!root)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
r = amdgpu_bo_reserve(root, true);
|
||||||
|
if (r)
|
||||||
|
goto error_unref;
|
||||||
|
|
||||||
|
/* Double check that the VM still exists */
|
||||||
|
spin_lock(&adev->vm_manager.pasid_lock);
|
||||||
|
vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
|
||||||
|
if (vm && vm->root.base.bo != root)
|
||||||
|
vm = NULL;
|
||||||
|
spin_unlock(&adev->vm_manager.pasid_lock);
|
||||||
|
if (!vm)
|
||||||
|
goto error_unlock;
|
||||||
|
|
||||||
|
addr /= AMDGPU_GPU_PAGE_SIZE;
|
||||||
|
flags = AMDGPU_PTE_VALID | AMDGPU_PTE_SNOOPED |
|
||||||
|
AMDGPU_PTE_SYSTEM;
|
||||||
|
|
||||||
|
if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_NEVER) {
|
||||||
|
/* Redirect the access to the dummy page */
|
||||||
|
value = adev->dummy_page_addr;
|
||||||
|
flags |= AMDGPU_PTE_EXECUTABLE | AMDGPU_PTE_READABLE |
|
||||||
|
AMDGPU_PTE_WRITEABLE;
|
||||||
|
} else {
|
||||||
|
/* Let the hw retry silently on the PTE */
|
||||||
|
value = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
r = amdgpu_vm_bo_update_mapping(adev, vm, true, NULL, addr, addr + 1,
|
||||||
|
flags, value, NULL, NULL);
|
||||||
|
if (r)
|
||||||
|
goto error_unlock;
|
||||||
|
|
||||||
|
r = amdgpu_vm_update_pdes(adev, vm, true);
|
||||||
|
|
||||||
|
error_unlock:
|
||||||
|
amdgpu_bo_unreserve(root);
|
||||||
|
if (r < 0)
|
||||||
|
DRM_ERROR("Can't handle page fault (%ld)\n", r);
|
||||||
|
|
||||||
|
error_unref:
|
||||||
|
amdgpu_bo_unref(&root);
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|||||||
@@ -413,6 +413,8 @@ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
|
|||||||
|
|
||||||
void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
|
void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
|
||||||
struct amdgpu_task_info *task_info);
|
struct amdgpu_task_info *task_info);
|
||||||
|
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, unsigned int pasid,
|
||||||
|
uint64_t addr);
|
||||||
|
|
||||||
void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
|
void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
|
||||||
|
|
||||||
|
|||||||
@@ -380,6 +380,10 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* If it's the first fault for this address, process it normally */
|
/* If it's the first fault for this address, process it normally */
|
||||||
|
if (retry_fault && !in_interrupt() &&
|
||||||
|
amdgpu_vm_handle_fault(adev, entry->pasid, addr))
|
||||||
|
return 1; /* This also prevents sending it to KFD */
|
||||||
|
|
||||||
if (!amdgpu_sriov_vf(adev)) {
|
if (!amdgpu_sriov_vf(adev)) {
|
||||||
/*
|
/*
|
||||||
* Issue a dummy read to wait for the status register to
|
* Issue a dummy read to wait for the status register to
|
||||||
|
|||||||
Reference in New Issue
Block a user