mirror of
https://github.com/torvalds/linux.git
synced 2024-12-26 21:02:19 +00:00
habanalabs: add a lock to protect multiple reset variables
Atomic operations during reset are replaced by a spinlock in order to have the ability to protect more than a single variable. Signed-off-by: Ofir Bitton <obitton@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
parent
eb13529191
commit
42eb2872e0
@ -250,8 +250,7 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
|
||||
* Can't use generic function to check this because of special case
|
||||
* where we create a CB as part of the reset process
|
||||
*/
|
||||
if ((hdev->disabled) || ((atomic_read(&hdev->reset_info.in_reset)) &&
|
||||
(ctx_id != HL_KERNEL_ASID_ID))) {
|
||||
if ((hdev->disabled) || (hdev->reset_info.in_reset && (ctx_id != HL_KERNEL_ASID_ID))) {
|
||||
dev_warn_ratelimited(hdev->dev,
|
||||
"Device is disabled or in reset. Can't create new CBs\n");
|
||||
rc = -EBUSY;
|
||||
|
@ -527,7 +527,7 @@ static int engines_show(struct seq_file *s, void *data)
|
||||
struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
|
||||
struct hl_device *hdev = dev_entry->hdev;
|
||||
|
||||
if (atomic_read(&hdev->reset_info.in_reset)) {
|
||||
if (hdev->reset_info.in_reset) {
|
||||
dev_warn_ratelimited(hdev->dev,
|
||||
"Can't check device idle during reset\n");
|
||||
return 0;
|
||||
@ -658,7 +658,7 @@ static ssize_t hl_data_read32(struct file *f, char __user *buf,
|
||||
ssize_t rc;
|
||||
u32 val;
|
||||
|
||||
if (atomic_read(&hdev->reset_info.in_reset)) {
|
||||
if (hdev->reset_info.in_reset) {
|
||||
dev_warn_ratelimited(hdev->dev, "Can't read during reset\n");
|
||||
return 0;
|
||||
}
|
||||
@ -694,7 +694,7 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf,
|
||||
u32 value;
|
||||
ssize_t rc;
|
||||
|
||||
if (atomic_read(&hdev->reset_info.in_reset)) {
|
||||
if (hdev->reset_info.in_reset) {
|
||||
dev_warn_ratelimited(hdev->dev, "Can't write during reset\n");
|
||||
return 0;
|
||||
}
|
||||
@ -731,7 +731,7 @@ static ssize_t hl_data_read64(struct file *f, char __user *buf,
|
||||
ssize_t rc;
|
||||
u64 val;
|
||||
|
||||
if (atomic_read(&hdev->reset_info.in_reset)) {
|
||||
if (hdev->reset_info.in_reset) {
|
||||
dev_warn_ratelimited(hdev->dev, "Can't read during reset\n");
|
||||
return 0;
|
||||
}
|
||||
@ -767,7 +767,7 @@ static ssize_t hl_data_write64(struct file *f, const char __user *buf,
|
||||
u64 value;
|
||||
ssize_t rc;
|
||||
|
||||
if (atomic_read(&hdev->reset_info.in_reset)) {
|
||||
if (hdev->reset_info.in_reset) {
|
||||
dev_warn_ratelimited(hdev->dev, "Can't write during reset\n");
|
||||
return 0;
|
||||
}
|
||||
@ -802,7 +802,7 @@ static ssize_t hl_dma_size_write(struct file *f, const char __user *buf,
|
||||
ssize_t rc;
|
||||
u32 size;
|
||||
|
||||
if (atomic_read(&hdev->reset_info.in_reset)) {
|
||||
if (hdev->reset_info.in_reset) {
|
||||
dev_warn_ratelimited(hdev->dev, "Can't DMA during reset\n");
|
||||
return 0;
|
||||
}
|
||||
@ -1077,7 +1077,7 @@ static ssize_t hl_clk_gate_write(struct file *f, const char __user *buf,
|
||||
u64 value;
|
||||
ssize_t rc;
|
||||
|
||||
if (atomic_read(&hdev->reset_info.in_reset)) {
|
||||
if (hdev->reset_info.in_reset) {
|
||||
dev_warn_ratelimited(hdev->dev,
|
||||
"Can't change clock gating during reset\n");
|
||||
return 0;
|
||||
@ -1119,7 +1119,7 @@ static ssize_t hl_stop_on_err_write(struct file *f, const char __user *buf,
|
||||
u32 value;
|
||||
ssize_t rc;
|
||||
|
||||
if (atomic_read(&hdev->reset_info.in_reset)) {
|
||||
if (hdev->reset_info.in_reset) {
|
||||
dev_warn_ratelimited(hdev->dev,
|
||||
"Can't change stop on error during reset\n");
|
||||
return 0;
|
||||
|
@ -17,7 +17,7 @@ enum hl_device_status hl_device_status(struct hl_device *hdev)
|
||||
{
|
||||
enum hl_device_status status;
|
||||
|
||||
if (atomic_read(&hdev->reset_info.in_reset))
|
||||
if (hdev->reset_info.in_reset)
|
||||
status = HL_DEVICE_STATUS_IN_RESET;
|
||||
else if (hdev->reset_info.needs_reset)
|
||||
status = HL_DEVICE_STATUS_NEEDS_RESET;
|
||||
@ -448,11 +448,11 @@ static int device_early_init(struct hl_device *hdev)
|
||||
mutex_init(&hdev->debug_lock);
|
||||
INIT_LIST_HEAD(&hdev->cs_mirror_list);
|
||||
spin_lock_init(&hdev->cs_mirror_lock);
|
||||
spin_lock_init(&hdev->reset_info.lock);
|
||||
INIT_LIST_HEAD(&hdev->fpriv_list);
|
||||
INIT_LIST_HEAD(&hdev->fpriv_ctrl_list);
|
||||
mutex_init(&hdev->fpriv_list_lock);
|
||||
mutex_init(&hdev->fpriv_ctrl_list_lock);
|
||||
atomic_set(&hdev->reset_info.in_reset, 0);
|
||||
mutex_init(&hdev->clk_throttling.lock);
|
||||
|
||||
return 0;
|
||||
@ -544,7 +544,7 @@ reschedule:
|
||||
* status for at least one heartbeat. From this point driver restarts
|
||||
* tracking future consecutive fatal errors.
|
||||
*/
|
||||
if (!(atomic_read(&hdev->reset_info.in_reset)))
|
||||
if (!hdev->reset_info.in_reset)
|
||||
hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
|
||||
|
||||
schedule_delayed_work(&hdev->work_heartbeat,
|
||||
@ -722,11 +722,14 @@ int hl_device_suspend(struct hl_device *hdev)
|
||||
pci_save_state(hdev->pdev);
|
||||
|
||||
/* Block future CS/VM/JOB completion operations */
|
||||
rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1);
|
||||
if (rc) {
|
||||
spin_lock(&hdev->reset_info.lock);
|
||||
if (hdev->reset_info.in_reset) {
|
||||
spin_unlock(&hdev->reset_info.lock);
|
||||
dev_err(hdev->dev, "Can't suspend while in reset\n");
|
||||
return -EIO;
|
||||
}
|
||||
hdev->reset_info.in_reset = 1;
|
||||
spin_unlock(&hdev->reset_info.lock);
|
||||
|
||||
/* This blocks all other stuff that is not blocked by in_reset */
|
||||
hdev->disabled = true;
|
||||
@ -776,8 +779,10 @@ int hl_device_resume(struct hl_device *hdev)
|
||||
}
|
||||
|
||||
|
||||
hdev->disabled = false;
|
||||
atomic_set(&hdev->reset_info.in_reset, 0);
|
||||
/* 'in_reset' was set to true during suspend, now we must clear it in order
|
||||
* for hard reset to be performed
|
||||
*/
|
||||
hdev->reset_info.in_reset = 0;
|
||||
|
||||
rc = hl_device_reset(hdev, HL_DRV_RESET_HARD);
|
||||
if (rc) {
|
||||
@ -1024,9 +1029,13 @@ do_reset:
|
||||
*/
|
||||
if (!from_hard_reset_thread) {
|
||||
/* Block future CS/VM/JOB completion operations */
|
||||
rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1);
|
||||
if (rc)
|
||||
spin_lock(&hdev->reset_info.lock);
|
||||
if (hdev->reset_info.in_reset) {
|
||||
spin_unlock(&hdev->reset_info.lock);
|
||||
return 0;
|
||||
}
|
||||
hdev->reset_info.in_reset = 1;
|
||||
spin_unlock(&hdev->reset_info.lock);
|
||||
|
||||
handle_reset_trigger(hdev, flags);
|
||||
|
||||
@ -1234,7 +1243,7 @@ kill_processes:
|
||||
}
|
||||
}
|
||||
|
||||
atomic_set(&hdev->reset_info.in_reset, 0);
|
||||
hdev->reset_info.in_reset = 0;
|
||||
hdev->reset_info.needs_reset = false;
|
||||
|
||||
dev_notice(hdev->dev, "Successfully finished resetting the device\n");
|
||||
@ -1272,7 +1281,7 @@ out_err:
|
||||
goto again;
|
||||
}
|
||||
|
||||
atomic_set(&hdev->reset_info.in_reset, 0);
|
||||
hdev->reset_info.in_reset = 0;
|
||||
|
||||
return rc;
|
||||
}
|
||||
@ -1583,6 +1592,7 @@ out_disabled:
|
||||
*/
|
||||
void hl_device_fini(struct hl_device *hdev)
|
||||
{
|
||||
bool device_in_reset;
|
||||
ktime_t timeout;
|
||||
u64 reset_sec;
|
||||
int i, rc;
|
||||
@ -1606,10 +1616,22 @@ void hl_device_fini(struct hl_device *hdev)
|
||||
*/
|
||||
|
||||
timeout = ktime_add_us(ktime_get(), reset_sec * 1000 * 1000);
|
||||
rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1);
|
||||
while (rc) {
|
||||
|
||||
spin_lock(&hdev->reset_info.lock);
|
||||
device_in_reset = !!hdev->reset_info.in_reset;
|
||||
if (!device_in_reset)
|
||||
hdev->reset_info.in_reset = 1;
|
||||
spin_unlock(&hdev->reset_info.lock);
|
||||
|
||||
while (device_in_reset) {
|
||||
usleep_range(50, 200);
|
||||
rc = atomic_cmpxchg(&hdev->reset_info.in_reset, 0, 1);
|
||||
|
||||
spin_lock(&hdev->reset_info.lock);
|
||||
device_in_reset = !!hdev->reset_info.in_reset;
|
||||
if (!device_in_reset)
|
||||
hdev->reset_info.in_reset = 1;
|
||||
spin_unlock(&hdev->reset_info.lock);
|
||||
|
||||
if (ktime_compare(ktime_get(), timeout) > 0) {
|
||||
dev_crit(hdev->dev,
|
||||
"Failed to remove device because reset function did not finish\n");
|
||||
|
@ -2457,9 +2457,10 @@ struct last_error_session_info {
|
||||
|
||||
/**
|
||||
* struct hl_reset_info - holds current device reset information.
|
||||
* @in_reset: is device in reset flow.
|
||||
* @lock: lock to protect critical reset flows.
|
||||
* @soft_reset_cnt: number of soft reset since the driver was loaded.
|
||||
* @hard_reset_cnt: number of hard reset since the driver was loaded.
|
||||
* @in_reset: is device in reset flow.
|
||||
* @is_in_soft_reset: Device is currently in soft reset process.
|
||||
* @needs_reset: true if reset_on_lockup is false and device should be reset
|
||||
* due to lockup.
|
||||
@ -2474,9 +2475,10 @@ struct last_error_session_info {
|
||||
* complete instead.
|
||||
*/
|
||||
struct hl_reset_info {
|
||||
atomic_t in_reset;
|
||||
spinlock_t lock;
|
||||
u32 soft_reset_cnt;
|
||||
u32 hard_reset_cnt;
|
||||
u8 in_reset;
|
||||
u8 is_in_soft_reset;
|
||||
u8 needs_reset;
|
||||
u8 hard_reset_pending;
|
||||
|
Loading…
Reference in New Issue
Block a user