mirror of
https://github.com/torvalds/linux.git
synced 2024-12-26 21:02:19 +00:00
habanalabs/gaudi: add razwi notify event
Each time a razwi (read as zero, write ignored) access happens, besides capturing its data, also notify the user about it. Signed-off-by: Dani Liberman <dliberman@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
parent
841cd2d765
commit
cb5fb665f3
@ -2409,6 +2409,14 @@ void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_
|
||||
num_of_engines * sizeof(u16));
|
||||
hdev->captured_err_info.razwi.flags = flags;
|
||||
}
|
||||
|
||||
void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
|
||||
u8 flags, u64 *event_mask)
|
||||
{
|
||||
hl_capture_razwi(hdev, addr, engine_id, num_of_engines, flags);
|
||||
*event_mask |= HL_NOTIFIER_EVENT_RAZWI;
|
||||
}
|
||||
|
||||
static void hl_capture_user_mappings(struct hl_device *hdev, bool is_pmmu)
|
||||
{
|
||||
struct page_fault_info *pgf_info = &hdev->captured_err_info.pgf_info;
|
||||
|
@ -3812,6 +3812,8 @@ hl_mmap_mem_buf_alloc(struct hl_mem_mgr *mmg,
|
||||
__printf(2, 3) void hl_engine_data_sprintf(struct engines_data *e, const char *fmt, ...);
|
||||
void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
|
||||
u8 flags);
|
||||
/* Captures the razwi data via hl_capture_razwi() and reports HL_NOTIFIER_EVENT_RAZWI through event_mask. */
void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
|
||||
u8 flags, u64 *event_mask);
|
||||
void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu);
|
||||
|
||||
#ifdef CONFIG_DEBUG_FS
|
||||
|
@ -7301,7 +7301,7 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e
|
||||
}
|
||||
|
||||
static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
|
||||
bool razwi)
|
||||
bool razwi, u64 *event_mask)
|
||||
{
|
||||
bool is_read = false, is_write = false;
|
||||
u16 engine_id[2], num_of_razwi_eng = 0;
|
||||
@ -7337,7 +7337,8 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
|
||||
num_of_razwi_eng = 1;
|
||||
}
|
||||
|
||||
hl_capture_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags);
|
||||
hl_handle_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags,
|
||||
event_mask);
|
||||
}
|
||||
}
|
||||
|
||||
@ -7675,7 +7676,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
|
||||
case GAUDI_EVENT_HBM_0_DERR ... GAUDI_EVENT_HBM_3_DERR:
|
||||
case GAUDI_EVENT_MMU_DERR:
|
||||
case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR:
|
||||
gaudi_print_irq_info(hdev, event_type, true);
|
||||
gaudi_print_irq_info(hdev, event_type, true, &event_mask);
|
||||
gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
|
||||
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
|
||||
fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
|
||||
@ -7685,7 +7686,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
|
||||
case GAUDI_EVENT_AXI_ECC:
|
||||
case GAUDI_EVENT_L2_RAM_ECC:
|
||||
case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17:
|
||||
gaudi_print_irq_info(hdev, event_type, false);
|
||||
gaudi_print_irq_info(hdev, event_type, false, &event_mask);
|
||||
fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
|
||||
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
|
||||
goto reset_device;
|
||||
@ -7694,7 +7695,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
|
||||
case GAUDI_EVENT_HBM1_SPI_0:
|
||||
case GAUDI_EVENT_HBM2_SPI_0:
|
||||
case GAUDI_EVENT_HBM3_SPI_0:
|
||||
gaudi_print_irq_info(hdev, event_type, false);
|
||||
gaudi_print_irq_info(hdev, event_type, false, &event_mask);
|
||||
gaudi_hbm_read_interrupts(hdev,
|
||||
gaudi_hbm_event_to_dev(event_type),
|
||||
&eq_entry->hbm_ecc_data);
|
||||
@ -7706,7 +7707,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
|
||||
case GAUDI_EVENT_HBM1_SPI_1:
|
||||
case GAUDI_EVENT_HBM2_SPI_1:
|
||||
case GAUDI_EVENT_HBM3_SPI_1:
|
||||
gaudi_print_irq_info(hdev, event_type, false);
|
||||
gaudi_print_irq_info(hdev, event_type, false, &event_mask);
|
||||
gaudi_hbm_read_interrupts(hdev,
|
||||
gaudi_hbm_event_to_dev(event_type),
|
||||
&eq_entry->hbm_ecc_data);
|
||||
@ -7728,7 +7729,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
|
||||
* if the event is a TPC Assertion or a "real" TPC DEC.
|
||||
*/
|
||||
event_mask |= HL_NOTIFIER_EVENT_TPC_ASSERT;
|
||||
gaudi_print_irq_info(hdev, event_type, true);
|
||||
gaudi_print_irq_info(hdev, event_type, true, &event_mask);
|
||||
reset_required = gaudi_tpc_read_interrupts(hdev,
|
||||
tpc_dec_event_to_tpc_id(event_type),
|
||||
"AXI_SLV_DEC_Error");
|
||||
@ -7753,7 +7754,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
|
||||
case GAUDI_EVENT_TPC5_KRN_ERR:
|
||||
case GAUDI_EVENT_TPC6_KRN_ERR:
|
||||
case GAUDI_EVENT_TPC7_KRN_ERR:
|
||||
gaudi_print_irq_info(hdev, event_type, true);
|
||||
gaudi_print_irq_info(hdev, event_type, true, &event_mask);
|
||||
reset_required = gaudi_tpc_read_interrupts(hdev,
|
||||
tpc_krn_event_to_tpc_id(event_type),
|
||||
"KRN_ERR");
|
||||
@ -7792,7 +7793,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
|
||||
case GAUDI_EVENT_HBM_0_SERR ... GAUDI_EVENT_HBM_3_SERR:
|
||||
fallthrough;
|
||||
case GAUDI_EVENT_MMU_SERR:
|
||||
gaudi_print_irq_info(hdev, event_type, true);
|
||||
gaudi_print_irq_info(hdev, event_type, true, &event_mask);
|
||||
gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
|
||||
hl_fw_unmask_irq(hdev, event_type);
|
||||
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
|
||||
@ -7802,14 +7803,14 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
|
||||
case GAUDI_EVENT_CPU_AXI_SPLITTER:
|
||||
case GAUDI_EVENT_PSOC_AXI_DEC:
|
||||
case GAUDI_EVENT_PSOC_PRSTN_FALL:
|
||||
gaudi_print_irq_info(hdev, event_type, true);
|
||||
gaudi_print_irq_info(hdev, event_type, true, &event_mask);
|
||||
hl_fw_unmask_irq(hdev, event_type);
|
||||
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
|
||||
break;
|
||||
|
||||
case GAUDI_EVENT_MMU_PAGE_FAULT:
|
||||
case GAUDI_EVENT_MMU_WR_PERM:
|
||||
gaudi_print_irq_info(hdev, event_type, true);
|
||||
gaudi_print_irq_info(hdev, event_type, true, &event_mask);
|
||||
hl_fw_unmask_irq(hdev, event_type);
|
||||
event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
|
||||
break;
|
||||
@ -7838,14 +7839,14 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
|
||||
case GAUDI_EVENT_NIC4_QM1:
|
||||
case GAUDI_EVENT_DMA0_CORE ... GAUDI_EVENT_DMA7_CORE:
|
||||
case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM:
|
||||
gaudi_print_irq_info(hdev, event_type, true);
|
||||
gaudi_print_irq_info(hdev, event_type, true, &event_mask);
|
||||
gaudi_handle_qman_err(hdev, event_type, &event_mask);
|
||||
hl_fw_unmask_irq(hdev, event_type);
|
||||
event_mask |= (HL_NOTIFIER_EVENT_USER_ENGINE_ERR | HL_NOTIFIER_EVENT_DEVICE_RESET);
|
||||
break;
|
||||
|
||||
case GAUDI_EVENT_RAZWI_OR_ADC_SW:
|
||||
gaudi_print_irq_info(hdev, event_type, true);
|
||||
gaudi_print_irq_info(hdev, event_type, true, &event_mask);
|
||||
event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
|
||||
goto reset_device;
|
||||
|
||||
@ -7858,7 +7859,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
|
||||
case GAUDI_EVENT_TPC6_BMON_SPMU:
|
||||
case GAUDI_EVENT_TPC7_BMON_SPMU:
|
||||
case GAUDI_EVENT_DMA_BM_CH0 ... GAUDI_EVENT_DMA_BM_CH7:
|
||||
gaudi_print_irq_info(hdev, event_type, false);
|
||||
gaudi_print_irq_info(hdev, event_type, false, &event_mask);
|
||||
hl_fw_unmask_irq(hdev, event_type);
|
||||
event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
|
||||
break;
|
||||
@ -7870,7 +7871,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
|
||||
break;
|
||||
|
||||
case GAUDI_EVENT_DMA_IF_SEI_0 ... GAUDI_EVENT_DMA_IF_SEI_3:
|
||||
gaudi_print_irq_info(hdev, event_type, false);
|
||||
gaudi_print_irq_info(hdev, event_type, false, &event_mask);
|
||||
gaudi_print_sm_sei_info(hdev, event_type,
|
||||
&eq_entry->sm_sei_data);
|
||||
rc = hl_state_dump(hdev);
|
||||
@ -7899,18 +7900,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
|
||||
break;
|
||||
|
||||
case GAUDI_EVENT_DEV_RESET_REQ:
|
||||
gaudi_print_irq_info(hdev, event_type, false);
|
||||
gaudi_print_irq_info(hdev, event_type, false, &event_mask);
|
||||
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
|
||||
goto reset_device;
|
||||
|
||||
case GAUDI_EVENT_PKT_QUEUE_OUT_SYNC:
|
||||
gaudi_print_irq_info(hdev, event_type, false);
|
||||
gaudi_print_irq_info(hdev, event_type, false, &event_mask);
|
||||
gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
|
||||
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
|
||||
goto reset_device;
|
||||
|
||||
case GAUDI_EVENT_FW_ALIVE_S:
|
||||
gaudi_print_irq_info(hdev, event_type, false);
|
||||
gaudi_print_irq_info(hdev, event_type, false, &event_mask);
|
||||
gaudi_print_fw_alive_info(hdev, &eq_entry->fw_alive);
|
||||
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
|
||||
goto reset_device;
|
||||
|
@ -721,6 +721,7 @@ enum hl_server_type {
|
||||
* HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE - Indicates device is unavailable
|
||||
* HL_NOTIFIER_EVENT_USER_ENGINE_ERR - Indicates device engine in error state
|
||||
* HL_NOTIFIER_EVENT_GENERAL_HW_ERR - Indicates device HW error
|
||||
* HL_NOTIFIER_EVENT_RAZWI - Indicates razwi happened
|
||||
*/
|
||||
#define HL_NOTIFIER_EVENT_TPC_ASSERT (1ULL << 0)
|
||||
#define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE (1ULL << 1)
|
||||
@ -729,6 +730,7 @@ enum hl_server_type {
|
||||
#define HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE (1ULL << 4)
|
||||
#define HL_NOTIFIER_EVENT_USER_ENGINE_ERR (1ULL << 5)
|
||||
#define HL_NOTIFIER_EVENT_GENERAL_HW_ERR (1ULL << 6)
|
||||
#define HL_NOTIFIER_EVENT_RAZWI (1ULL << 7)
|
||||
|
||||
/* Opcode for management ioctl
|
||||
*
|
||||
|
Loading…
Reference in New Issue
Block a user