mirror of
https://github.com/torvalds/linux.git
synced 2024-12-26 21:02:19 +00:00
habanalabs: extend process wait timeout in device fini
Processes that use our device are likely to use other devices, such as remote storage, at the same time. If our device is removed while a user process is still using it, we need to kill that user process. However, if the process has a thread waiting for I/O to complete (on remote storage, for example), the process won't terminate right away. Let's give it enough time to terminate before giving up. Signed-off-by: Oded Gabbay <ogabbay@kernel.org> Reviewed-by: Tomer Tayar <ttayar@habana.ai>
This commit is contained in:
parent
f69c3e460a
commit
b585daa89d
@ -2300,14 +2300,16 @@ void hl_device_fini(struct hl_device *hdev)
|
||||
*/
|
||||
dev_info(hdev->dev,
|
||||
"Waiting for all processes to exit (timeout of %u seconds)",
|
||||
HL_PENDING_RESET_LONG_SEC);
|
||||
HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI);
|
||||
|
||||
rc = device_kill_open_processes(hdev, HL_PENDING_RESET_LONG_SEC, false);
|
||||
hdev->process_kill_trial_cnt = 0;
|
||||
rc = device_kill_open_processes(hdev, HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI, false);
|
||||
if (rc) {
|
||||
dev_crit(hdev->dev, "Failed to kill all open processes\n");
|
||||
device_disable_open_processes(hdev, false);
|
||||
}
|
||||
|
||||
hdev->process_kill_trial_cnt = 0;
|
||||
rc = device_kill_open_processes(hdev, 0, true);
|
||||
if (rc) {
|
||||
dev_crit(hdev->dev, "Failed to kill all control device open processes\n");
|
||||
|
@ -50,9 +50,14 @@ struct hl_fpriv;
|
||||
#define HL_MMAP_OFFSET_VALUE_MASK (0x1FFFFFFFFFFFull >> PAGE_SHIFT)
|
||||
#define HL_MMAP_OFFSET_VALUE_GET(off) (off & HL_MMAP_OFFSET_VALUE_MASK)
|
||||
|
||||
#define HL_PENDING_RESET_PER_SEC 10
|
||||
#define HL_PENDING_RESET_MAX_TRIALS 60 /* 10 minutes */
|
||||
#define HL_PENDING_RESET_LONG_SEC 60
|
||||
#define HL_PENDING_RESET_PER_SEC 10
|
||||
#define HL_PENDING_RESET_MAX_TRIALS 60 /* 10 minutes */
|
||||
#define HL_PENDING_RESET_LONG_SEC 60
|
||||
/*
|
||||
* In device fini, wait 10 minutes for user processes to be terminated after we kill them.
|
||||
* This is needed to prevent a situation where resources are cleared while user processes are still alive.
|
||||
*/
|
||||
#define HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI 600
|
||||
|
||||
#define HL_HARD_RESET_MAX_TIMEOUT 120
|
||||
#define HL_PLDM_HARD_RESET_MAX_TIMEOUT (HL_HARD_RESET_MAX_TIMEOUT * 3)
|
||||
|
Loading…
Reference in New Issue
Block a user