habanalabs/gaudi: add razwi notify event

Each time razwi (read-only zero, write ignore) happens, besides
capturing its data, also notify the user about it.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
Dani Liberman 2022-10-30 13:08:37 +02:00 committed by Oded Gabbay
parent 841cd2d765
commit cb5fb665f3
4 changed files with 31 additions and 18 deletions

View File

@ -2409,6 +2409,14 @@ void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_
num_of_engines * sizeof(u16));
hdev->captured_err_info.razwi.flags = flags;
}
void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
u8 flags, u64 *event_mask)
{
hl_capture_razwi(hdev, addr, engine_id, num_of_engines, flags);
*event_mask |= HL_NOTIFIER_EVENT_RAZWI;
}
static void hl_capture_user_mappings(struct hl_device *hdev, bool is_pmmu)
{
struct page_fault_info *pgf_info = &hdev->captured_err_info.pgf_info;

View File

@ -3812,6 +3812,8 @@ hl_mmap_mem_buf_alloc(struct hl_mem_mgr *mmg,
__printf(2, 3) void hl_engine_data_sprintf(struct engines_data *e, const char *fmt, ...);
void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
u8 flags);
void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
u8 flags, u64 *event_mask);
void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu);
#ifdef CONFIG_DEBUG_FS

View File

@ -7301,7 +7301,7 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e
}
static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
bool razwi)
bool razwi, u64 *event_mask)
{
bool is_read = false, is_write = false;
u16 engine_id[2], num_of_razwi_eng = 0;
@ -7337,7 +7337,8 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
num_of_razwi_eng = 1;
}
hl_capture_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags);
hl_handle_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags,
event_mask);
}
}
@ -7675,7 +7676,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_HBM_0_DERR ... GAUDI_EVENT_HBM_3_DERR:
case GAUDI_EVENT_MMU_DERR:
case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR:
gaudi_print_irq_info(hdev, event_type, true);
gaudi_print_irq_info(hdev, event_type, true, &event_mask);
gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
@ -7685,7 +7686,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_AXI_ECC:
case GAUDI_EVENT_L2_RAM_ECC:
case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17:
gaudi_print_irq_info(hdev, event_type, false);
gaudi_print_irq_info(hdev, event_type, false, &event_mask);
fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
goto reset_device;
@ -7694,7 +7695,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_HBM1_SPI_0:
case GAUDI_EVENT_HBM2_SPI_0:
case GAUDI_EVENT_HBM3_SPI_0:
gaudi_print_irq_info(hdev, event_type, false);
gaudi_print_irq_info(hdev, event_type, false, &event_mask);
gaudi_hbm_read_interrupts(hdev,
gaudi_hbm_event_to_dev(event_type),
&eq_entry->hbm_ecc_data);
@ -7706,7 +7707,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_HBM1_SPI_1:
case GAUDI_EVENT_HBM2_SPI_1:
case GAUDI_EVENT_HBM3_SPI_1:
gaudi_print_irq_info(hdev, event_type, false);
gaudi_print_irq_info(hdev, event_type, false, &event_mask);
gaudi_hbm_read_interrupts(hdev,
gaudi_hbm_event_to_dev(event_type),
&eq_entry->hbm_ecc_data);
@ -7728,7 +7729,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
* if the event is a TPC Assertion or a "real" TPC DEC.
*/
event_mask |= HL_NOTIFIER_EVENT_TPC_ASSERT;
gaudi_print_irq_info(hdev, event_type, true);
gaudi_print_irq_info(hdev, event_type, true, &event_mask);
reset_required = gaudi_tpc_read_interrupts(hdev,
tpc_dec_event_to_tpc_id(event_type),
"AXI_SLV_DEC_Error");
@ -7753,7 +7754,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_TPC5_KRN_ERR:
case GAUDI_EVENT_TPC6_KRN_ERR:
case GAUDI_EVENT_TPC7_KRN_ERR:
gaudi_print_irq_info(hdev, event_type, true);
gaudi_print_irq_info(hdev, event_type, true, &event_mask);
reset_required = gaudi_tpc_read_interrupts(hdev,
tpc_krn_event_to_tpc_id(event_type),
"KRN_ERR");
@ -7792,7 +7793,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_HBM_0_SERR ... GAUDI_EVENT_HBM_3_SERR:
fallthrough;
case GAUDI_EVENT_MMU_SERR:
gaudi_print_irq_info(hdev, event_type, true);
gaudi_print_irq_info(hdev, event_type, true, &event_mask);
gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
hl_fw_unmask_irq(hdev, event_type);
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
@ -7802,14 +7803,14 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_CPU_AXI_SPLITTER:
case GAUDI_EVENT_PSOC_AXI_DEC:
case GAUDI_EVENT_PSOC_PRSTN_FALL:
gaudi_print_irq_info(hdev, event_type, true);
gaudi_print_irq_info(hdev, event_type, true, &event_mask);
hl_fw_unmask_irq(hdev, event_type);
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI_EVENT_MMU_PAGE_FAULT:
case GAUDI_EVENT_MMU_WR_PERM:
gaudi_print_irq_info(hdev, event_type, true);
gaudi_print_irq_info(hdev, event_type, true, &event_mask);
hl_fw_unmask_irq(hdev, event_type);
event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
@ -7838,14 +7839,14 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_NIC4_QM1:
case GAUDI_EVENT_DMA0_CORE ... GAUDI_EVENT_DMA7_CORE:
case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM:
gaudi_print_irq_info(hdev, event_type, true);
gaudi_print_irq_info(hdev, event_type, true, &event_mask);
gaudi_handle_qman_err(hdev, event_type, &event_mask);
hl_fw_unmask_irq(hdev, event_type);
event_mask |= (HL_NOTIFIER_EVENT_USER_ENGINE_ERR | HL_NOTIFIER_EVENT_DEVICE_RESET);
break;
case GAUDI_EVENT_RAZWI_OR_ADC_SW:
gaudi_print_irq_info(hdev, event_type, true);
gaudi_print_irq_info(hdev, event_type, true, &event_mask);
event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
goto reset_device;
@ -7858,7 +7859,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_TPC6_BMON_SPMU:
case GAUDI_EVENT_TPC7_BMON_SPMU:
case GAUDI_EVENT_DMA_BM_CH0 ... GAUDI_EVENT_DMA_BM_CH7:
gaudi_print_irq_info(hdev, event_type, false);
gaudi_print_irq_info(hdev, event_type, false, &event_mask);
hl_fw_unmask_irq(hdev, event_type);
event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
@ -7870,7 +7871,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
break;
case GAUDI_EVENT_DMA_IF_SEI_0 ... GAUDI_EVENT_DMA_IF_SEI_3:
gaudi_print_irq_info(hdev, event_type, false);
gaudi_print_irq_info(hdev, event_type, false, &event_mask);
gaudi_print_sm_sei_info(hdev, event_type,
&eq_entry->sm_sei_data);
rc = hl_state_dump(hdev);
@ -7899,18 +7900,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
break;
case GAUDI_EVENT_DEV_RESET_REQ:
gaudi_print_irq_info(hdev, event_type, false);
gaudi_print_irq_info(hdev, event_type, false, &event_mask);
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
goto reset_device;
case GAUDI_EVENT_PKT_QUEUE_OUT_SYNC:
gaudi_print_irq_info(hdev, event_type, false);
gaudi_print_irq_info(hdev, event_type, false, &event_mask);
gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
goto reset_device;
case GAUDI_EVENT_FW_ALIVE_S:
gaudi_print_irq_info(hdev, event_type, false);
gaudi_print_irq_info(hdev, event_type, false, &event_mask);
gaudi_print_fw_alive_info(hdev, &eq_entry->fw_alive);
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
goto reset_device;

View File

@ -721,6 +721,7 @@ enum hl_server_type {
* HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE - Indicates device is unavailable
* HL_NOTIFIER_EVENT_USER_ENGINE_ERR - Indicates device engine in error state
* HL_NOTIFIER_EVENT_GENERAL_HW_ERR - Indicates device HW error
* HL_NOTIFIER_EVENT_RAZWI - Indicates razwi happened
*/
#define HL_NOTIFIER_EVENT_TPC_ASSERT (1ULL << 0)
#define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE (1ULL << 1)
@ -729,6 +730,7 @@ enum hl_server_type {
#define HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE (1ULL << 4)
#define HL_NOTIFIER_EVENT_USER_ENGINE_ERR (1ULL << 5)
#define HL_NOTIFIER_EVENT_GENERAL_HW_ERR (1ULL << 6)
#define HL_NOTIFIER_EVENT_RAZWI (1ULL << 7)
/* Opcode for management ioctl
*