From 35d8944480ab892a725ed4edae08a675c600b1de Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Wed, 16 Nov 2022 17:27:26 +0200 Subject: [PATCH] habanalabs/gaudi2: remove duplicated event prints In order to reduce error log, we try to minimize the dumped rows while keeping all relevant error info. In addition we completely remove clock throttling debug logs. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi2/gaudi2.c | 339 +++++++++++------------- 1 file changed, 149 insertions(+), 190 deletions(-) diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index b8da2aa024ca..a187e1ea68ee 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -6804,38 +6804,37 @@ static inline bool is_info_event(u32 event) switch (event) { case GAUDI2_EVENT_CPU_CPLD_SHUTDOWN_CAUSE: case GAUDI2_EVENT_CPU_FIX_POWER_ENV_S ... GAUDI2_EVENT_CPU_FIX_THERMAL_ENV_E: + + /* return in case of NIC status event - these events are received periodically and not as + * an indication to an error. + */ + case GAUDI2_EVENT_CPU0_STATUS_NIC0_ENG0 ... GAUDI2_EVENT_CPU11_STATUS_NIC11_ENG1: return true; default: return false; } } -static void gaudi2_print_irq_info(struct hl_device *hdev, u16 event_type) +static void gaudi2_print_event(struct hl_device *hdev, u16 event_type, + bool ratelimited, const char *fmt, ...) { - char desc[64] = ""; - bool event_valid = false; + struct va_format vaf; + va_list args; - /* return in case of NIC status event - these events are received periodically and not as - * an indication to an error, thus not printed. - */ - if (event_type >= GAUDI2_EVENT_CPU0_STATUS_NIC0_ENG0 && - event_type <= GAUDI2_EVENT_CPU11_STATUS_NIC11_ENG1) - return; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; - if (gaudi2_irq_map_table[event_type].valid) { - snprintf(desc, sizeof(desc), gaudi2_irq_map_table[event_type].name); - event_valid = true; - } - - if (!event_valid) - snprintf(desc, sizeof(desc), "N/A"); - - if (is_info_event(event_type)) - dev_info_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n", - event_type, desc); + if (ratelimited) + dev_err_ratelimited(hdev->dev, "%s: %pV\n", + gaudi2_irq_map_table[event_type].valid ? + gaudi2_irq_map_table[event_type].name : "N/A Event", &vaf); else - dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n", - event_type, desc); + dev_err(hdev->dev, "%s: %pV\n", + gaudi2_irq_map_table[event_type].valid ? + gaudi2_irq_map_table[event_type].name : "N/A Event", &vaf); + + va_end(args); } static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type, @@ -6848,7 +6847,7 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type, ecc_syndrom = le64_to_cpu(ecc_data->ecc_syndrom); memory_wrapper_idx = ecc_data->memory_wrapper_idx; - dev_err(hdev->dev, + gaudi2_print_event(hdev, event_type, !ecc_data->is_critical, "ECC error detected. address: %#llx. Syndrom: %#llx. block id %u. critical %u.\n", ecc_address, ecc_syndrom, memory_wrapper_idx, ecc_data->is_critical); @@ -6988,7 +6987,7 @@ static void print_qman_data_on_err(struct hl_device *hdev, u32 qid_base, u32 str gaudi2_print_last_pqes_on_err(hdev, qid_base, i, qman_base, false); } -static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, const char *qm_name, +static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type, u64 qman_base, u32 qid_base) { u32 i, j, glbl_sts_val, arb_err_val, num_error_causes, error_count = 0; @@ -7015,11 +7014,11 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, const char *qm for (j = 0 ; j < num_error_causes ; j++) if (glbl_sts_val & BIT(j)) { - dev_err_ratelimited(hdev->dev, "%s %s. err cause: %s\n", - qm_name, reg_desc, - i == QMAN_STREAMS ? - gaudi2_qman_lower_cp_error_cause[j] : - gaudi2_qman_error_cause[j]); + gaudi2_print_event(hdev, event_type, true, + "%s. err cause: %s", reg_desc, + i == QMAN_STREAMS ? + gaudi2_qman_lower_cp_error_cause[j] : + gaudi2_qman_error_cause[j]); error_count++; } @@ -7033,8 +7032,9 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, const char *qm for (j = 0 ; j < GAUDI2_NUM_OF_QM_ARB_ERR_CAUSE ; j++) { if (arb_err_val & BIT(j)) { - dev_err_ratelimited(hdev->dev, "%s ARB_ERR. err cause: %s\n", - qm_name, gaudi2_qman_arb_error_cause[j]); + gaudi2_print_event(hdev, event_type, true, + "ARB_ERR. err cause: %s", + gaudi2_qman_arb_error_cause[j]); error_count++; } } @@ -7761,7 +7761,7 @@ clear: return error_count; } -static int _gaudi2_handle_qm_sei_err(struct hl_device *hdev, u64 qman_base) +static int _gaudi2_handle_qm_sei_err(struct hl_device *hdev, u64 qman_base, u16 event_type) { u32 i, sts_val, sts_clr_val = 0, error_count = 0; @@ -7769,8 +7769,8 @@ static int _gaudi2_handle_qm_sei_err(struct hl_device *hdev, u64 qman_base) for (i = 0 ; i < GAUDI2_NUM_OF_QM_SEI_ERR_CAUSE ; i++) { if (sts_val & BIT(i)) { - dev_err_ratelimited(hdev->dev, "QM SEI. err cause: %s\n", - gaudi2_qm_sei_error_cause[i]); + gaudi2_print_event(hdev, event_type, true, + "err cause: %s", gaudi2_qm_sei_error_cause[i]); sts_clr_val |= BIT(i); error_count++; } @@ -7827,13 +7827,13 @@ static int gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type, return 0; } - error_count = _gaudi2_handle_qm_sei_err(hdev, qman_base); + error_count = _gaudi2_handle_qm_sei_err(hdev, qman_base, event_type); /* There is a single event per NIC macro, so should check its both QMAN blocks */ if (event_type >= GAUDI2_EVENT_NIC0_AXI_ERROR_RESPONSE && event_type <= GAUDI2_EVENT_NIC11_AXI_ERROR_RESPONSE) error_count += _gaudi2_handle_qm_sei_err(hdev, - qman_base + NIC_QM_OFFSET); + qman_base + NIC_QM_OFFSET, event_type); /* check if RAZWI happened */ if (razwi_info) @@ -7846,7 +7846,6 @@ static int gaudi2_handle_qman_err(struct hl_device *hdev, u16 event_type) { u32 qid_base, error_count = 0; u64 qman_base; - char desc[32]; u8 index; switch (event_type) { @@ -7854,125 +7853,104 @@ static int gaudi2_handle_qman_err(struct hl_device *hdev, u16 event_type) index = event_type - GAUDI2_EVENT_TPC0_QM; qid_base = GAUDI2_QUEUE_ID_DCORE0_TPC_0_0 + index * QMAN_STREAMS; qman_base = mmDCORE0_TPC0_QM_BASE + index * DCORE_TPC_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "DCORE0_TPC%d_QM", index); break; case GAUDI2_EVENT_TPC6_QM ... GAUDI2_EVENT_TPC11_QM: index = event_type - GAUDI2_EVENT_TPC6_QM; qid_base = GAUDI2_QUEUE_ID_DCORE1_TPC_0_0 + index * QMAN_STREAMS; qman_base = mmDCORE1_TPC0_QM_BASE + index * DCORE_TPC_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "DCORE1_TPC%d_QM", index); break; case GAUDI2_EVENT_TPC12_QM ... GAUDI2_EVENT_TPC17_QM: index = event_type - GAUDI2_EVENT_TPC12_QM; qid_base = GAUDI2_QUEUE_ID_DCORE2_TPC_0_0 + index * QMAN_STREAMS; qman_base = mmDCORE2_TPC0_QM_BASE + index * DCORE_TPC_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "DCORE2_TPC%d_QM", index); break; case GAUDI2_EVENT_TPC18_QM ... GAUDI2_EVENT_TPC23_QM: index = event_type - GAUDI2_EVENT_TPC18_QM; qid_base = GAUDI2_QUEUE_ID_DCORE3_TPC_0_0 + index * QMAN_STREAMS; qman_base = mmDCORE3_TPC0_QM_BASE + index * DCORE_TPC_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "DCORE3_TPC%d_QM", index); break; case GAUDI2_EVENT_TPC24_QM: qid_base = GAUDI2_QUEUE_ID_DCORE0_TPC_6_0; qman_base = mmDCORE0_TPC6_QM_BASE; - snprintf(desc, ARRAY_SIZE(desc), "DCORE0_TPC6_QM"); break; case GAUDI2_EVENT_MME0_QM: qid_base = GAUDI2_QUEUE_ID_DCORE0_MME_0_0; qman_base = mmDCORE0_MME_QM_BASE; - snprintf(desc, ARRAY_SIZE(desc), "DCORE0_MME_QM"); break; case GAUDI2_EVENT_MME1_QM: qid_base = GAUDI2_QUEUE_ID_DCORE1_MME_0_0; qman_base = mmDCORE1_MME_QM_BASE; - snprintf(desc, ARRAY_SIZE(desc), "DCORE1_MME_QM"); break; case GAUDI2_EVENT_MME2_QM: qid_base = GAUDI2_QUEUE_ID_DCORE2_MME_0_0; qman_base = mmDCORE2_MME_QM_BASE; - snprintf(desc, ARRAY_SIZE(desc), "DCORE2_MME_QM"); break; case GAUDI2_EVENT_MME3_QM: qid_base = GAUDI2_QUEUE_ID_DCORE3_MME_0_0; qman_base = mmDCORE3_MME_QM_BASE; - snprintf(desc, ARRAY_SIZE(desc), "DCORE3_MME_QM"); break; case GAUDI2_EVENT_HDMA0_QM: qid_base = GAUDI2_QUEUE_ID_DCORE0_EDMA_0_0; qman_base = mmDCORE0_EDMA0_QM_BASE; - snprintf(desc, ARRAY_SIZE(desc), "DCORE0_EDMA0_QM"); break; case GAUDI2_EVENT_HDMA1_QM: qid_base = GAUDI2_QUEUE_ID_DCORE0_EDMA_1_0; qman_base = mmDCORE0_EDMA1_QM_BASE; - snprintf(desc, ARRAY_SIZE(desc), "DCORE0_EDMA1_QM"); break; case GAUDI2_EVENT_HDMA2_QM: qid_base = GAUDI2_QUEUE_ID_DCORE1_EDMA_0_0; qman_base = mmDCORE1_EDMA0_QM_BASE; - snprintf(desc, ARRAY_SIZE(desc), "DCORE1_EDMA0_QM"); break; case GAUDI2_EVENT_HDMA3_QM: qid_base = GAUDI2_QUEUE_ID_DCORE1_EDMA_1_0; qman_base = mmDCORE1_EDMA1_QM_BASE; - snprintf(desc, ARRAY_SIZE(desc), "DCORE1_EDMA1_QM"); break; case GAUDI2_EVENT_HDMA4_QM: qid_base = GAUDI2_QUEUE_ID_DCORE2_EDMA_0_0; qman_base = mmDCORE2_EDMA0_QM_BASE; - snprintf(desc, ARRAY_SIZE(desc), "DCORE2_EDMA0_QM"); break; case GAUDI2_EVENT_HDMA5_QM: qid_base = GAUDI2_QUEUE_ID_DCORE2_EDMA_1_0; qman_base = mmDCORE2_EDMA1_QM_BASE; - snprintf(desc, ARRAY_SIZE(desc), "DCORE2_EDMA1_QM"); break; case GAUDI2_EVENT_HDMA6_QM: qid_base = GAUDI2_QUEUE_ID_DCORE3_EDMA_0_0; qman_base = mmDCORE3_EDMA0_QM_BASE; - snprintf(desc, ARRAY_SIZE(desc), "DCORE3_EDMA0_QM"); break; case GAUDI2_EVENT_HDMA7_QM: qid_base = GAUDI2_QUEUE_ID_DCORE3_EDMA_1_0; qman_base = mmDCORE3_EDMA1_QM_BASE; - snprintf(desc, ARRAY_SIZE(desc), "DCORE3_EDMA1_QM"); break; case GAUDI2_EVENT_PDMA0_QM: qid_base = GAUDI2_QUEUE_ID_PDMA_0_0; qman_base = mmPDMA0_QM_BASE; - snprintf(desc, ARRAY_SIZE(desc), "PDMA0_QM"); break; case GAUDI2_EVENT_PDMA1_QM: qid_base = GAUDI2_QUEUE_ID_PDMA_1_0; qman_base = mmPDMA1_QM_BASE; - snprintf(desc, ARRAY_SIZE(desc), "PDMA1_QM"); break; case GAUDI2_EVENT_ROTATOR0_ROT0_QM: qid_base = GAUDI2_QUEUE_ID_ROT_0_0; qman_base = mmROT0_QM_BASE; - snprintf(desc, ARRAY_SIZE(desc), "ROTATOR0_QM"); break; case GAUDI2_EVENT_ROTATOR1_ROT1_QM: qid_base = GAUDI2_QUEUE_ID_ROT_1_0; qman_base = mmROT1_QM_BASE; - snprintf(desc, ARRAY_SIZE(desc), "ROTATOR1_QM"); break; default: return 0; } - error_count = gaudi2_handle_qman_err_generic(hdev, desc, qman_base, qid_base); + error_count = gaudi2_handle_qman_err_generic(hdev, event_type, qman_base, qid_base); /* Handle EDMA QM SEI here because there is no AXI error response event for EDMA */ if (event_type >= GAUDI2_EVENT_HDMA2_QM && event_type <= GAUDI2_EVENT_HDMA5_QM) - error_count += _gaudi2_handle_qm_sei_err(hdev, qman_base); + error_count += _gaudi2_handle_qm_sei_err(hdev, qman_base, event_type); return error_count; } -static int gaudi2_handle_arc_farm_sei_err(struct hl_device *hdev) +static int gaudi2_handle_arc_farm_sei_err(struct hl_device *hdev, u16 event_type) { u32 i, sts_val, sts_clr_val = 0, error_count = 0; @@ -7980,8 +7958,8 @@ static int gaudi2_handle_arc_farm_sei_err(struct hl_device *hdev) for (i = 0 ; i < GAUDI2_NUM_OF_ARC_SEI_ERR_CAUSE ; i++) { if (sts_val & BIT(i)) { - dev_err_ratelimited(hdev->dev, "ARC SEI. err cause: %s\n", - gaudi2_arc_sei_error_cause[i]); + gaudi2_print_event(hdev, event_type, true, + "err cause: %s", gaudi2_arc_sei_error_cause[i]); sts_clr_val |= BIT(i); error_count++; } @@ -7992,7 +7970,7 @@ static int gaudi2_handle_arc_farm_sei_err(struct hl_device *hdev) return error_count; } -static int gaudi2_handle_cpu_sei_err(struct hl_device *hdev) +static int gaudi2_handle_cpu_sei_err(struct hl_device *hdev, u16 event_type) { u32 i, sts_val, sts_clr_val = 0, error_count = 0; @@ -8000,8 +7978,8 @@ static int gaudi2_handle_cpu_sei_err(struct hl_device *hdev) for (i = 0 ; i < GAUDI2_NUM_OF_CPU_SEI_ERR_CAUSE ; i++) { if (sts_val & BIT(i)) { - dev_err_ratelimited(hdev->dev, "CPU SEI. err cause: %s\n", - gaudi2_cpu_sei_error_cause[i]); + gaudi2_print_event(hdev, event_type, true, + "err cause: %s", gaudi2_cpu_sei_error_cause[i]); sts_clr_val |= BIT(i); error_count++; } @@ -8012,7 +7990,7 @@ static int gaudi2_handle_cpu_sei_err(struct hl_device *hdev) return error_count; } -static int gaudi2_handle_rot_err(struct hl_device *hdev, u8 rot_index, +static int gaudi2_handle_rot_err(struct hl_device *hdev, u8 rot_index, u16 event_type, struct hl_eq_razwi_with_intr_cause *razwi_with_intr_cause, u64 *event_mask) { @@ -8022,8 +8000,8 @@ static int gaudi2_handle_rot_err(struct hl_device *hdev, u8 rot_index, for (i = 0 ; i < GAUDI2_NUM_OF_ROT_ERR_CAUSE ; i++) if (intr_cause_data & BIT(i)) { - dev_err_ratelimited(hdev->dev, "ROT%u. err cause: %s\n", - rot_index, guadi2_rot_error_cause[i]); + gaudi2_print_event(hdev, event_type, true, + "err cause: %s", guadi2_rot_error_cause[i]); error_count++; } @@ -8034,7 +8012,7 @@ static int gaudi2_handle_rot_err(struct hl_device *hdev, u8 rot_index, return error_count; } -static int gaudi2_tpc_ack_interrupts(struct hl_device *hdev, u8 tpc_index, char *interrupt_name, +static int gaudi2_tpc_ack_interrupts(struct hl_device *hdev, u8 tpc_index, u16 event_type, struct hl_eq_razwi_with_intr_cause *razwi_with_intr_cause, u64 *event_mask) { @@ -8044,8 +8022,8 @@ static int gaudi2_tpc_ack_interrupts(struct hl_device *hdev, u8 tpc_index, char for (i = 0 ; i < GAUDI2_NUM_OF_TPC_INTR_CAUSE ; i++) if (intr_cause_data & BIT(i)) { - dev_err_ratelimited(hdev->dev, "TPC%d_%s interrupt cause: %s\n", - tpc_index, interrupt_name, gaudi2_tpc_interrupts_cause[i]); + gaudi2_print_event(hdev, event_type, true, + "interrupt cause: %s", gaudi2_tpc_interrupts_cause[i]); error_count++; } @@ -8056,7 +8034,7 @@ static int gaudi2_tpc_ack_interrupts(struct hl_device *hdev, u8 tpc_index, char return error_count; } -static int gaudi2_handle_dec_err(struct hl_device *hdev, u8 dec_index, const char *interrupt_name, +static int gaudi2_handle_dec_err(struct hl_device *hdev, u8 dec_index, u16 event_type, struct hl_eq_razwi_info *razwi_info, u64 *event_mask) { u32 sts_addr, sts_val, sts_clr_val = 0, error_count = 0; @@ -8076,8 +8054,8 @@ static int gaudi2_handle_dec_err(struct hl_device *hdev, u8 dec_index, const cha for (i = 0 ; i < GAUDI2_NUM_OF_DEC_ERR_CAUSE ; i++) { if (sts_val & BIT(i)) { - dev_err_ratelimited(hdev->dev, "DEC%u_%s err cause: %s\n", - dec_index, interrupt_name, gaudi2_dec_error_cause[i]); + gaudi2_print_event(hdev, event_type, true, + "err cause: %s", gaudi2_dec_error_cause[i]); sts_clr_val |= BIT(i); error_count++; } @@ -8093,7 +8071,7 @@ static int gaudi2_handle_dec_err(struct hl_device *hdev, u8 dec_index, const cha return error_count; } -static int gaudi2_handle_mme_err(struct hl_device *hdev, u8 mme_index, const char *interrupt_name, +static int gaudi2_handle_mme_err(struct hl_device *hdev, u8 mme_index, u16 event_type, struct hl_eq_razwi_info *razwi_info, u64 *event_mask) { u32 sts_addr, sts_val, sts_clr_addr, sts_clr_val = 0, error_count = 0; @@ -8106,8 +8084,8 @@ static int gaudi2_handle_mme_err(struct hl_device *hdev, u8 mme_index, const cha for (i = 0 ; i < GAUDI2_NUM_OF_MME_ERR_CAUSE ; i++) { if (sts_val & BIT(i)) { - dev_err_ratelimited(hdev->dev, "MME%u_%s err cause: %s\n", - mme_index, interrupt_name, guadi2_mme_error_cause[i]); + gaudi2_print_event(hdev, event_type, true, + "err cause: %s", guadi2_mme_error_cause[i]); sts_clr_val |= BIT(i); error_count++; } @@ -8123,22 +8101,22 @@ static int gaudi2_handle_mme_err(struct hl_device *hdev, u8 mme_index, const cha return error_count; } -static int gaudi2_handle_mme_sbte_err(struct hl_device *hdev, u8 mme_index, u8 sbte_index, +static int gaudi2_handle_mme_sbte_err(struct hl_device *hdev, u16 event_type, u64 intr_cause_data) { int i, error_count = 0; for (i = 0 ; i < GAUDI2_NUM_OF_MME_SBTE_ERR_CAUSE ; i++) if (intr_cause_data & BIT(i)) { - dev_err_ratelimited(hdev->dev, "MME%uSBTE%u_AXI_ERR_RSP err cause: %s\n", - mme_index, sbte_index, guadi2_mme_sbte_error_cause[i]); + gaudi2_print_event(hdev, event_type, true, + "err cause: %s", guadi2_mme_sbte_error_cause[i]); error_count++; } return error_count; } -static int gaudi2_handle_mme_wap_err(struct hl_device *hdev, u8 mme_index, +static int gaudi2_handle_mme_wap_err(struct hl_device *hdev, u8 mme_index, u16 event_type, struct hl_eq_razwi_info *razwi_info, u64 *event_mask) { u32 sts_addr, sts_val, sts_clr_addr, sts_clr_val = 0, error_count = 0; @@ -8151,9 +8129,8 @@ static int gaudi2_handle_mme_wap_err(struct hl_device *hdev, u8 mme_index, for (i = 0 ; i < GAUDI2_NUM_OF_MME_WAP_ERR_CAUSE ; i++) { if (sts_val & BIT(i)) { - dev_err_ratelimited(hdev->dev, - "MME%u_WAP_SOURCE_RESULT_INVALID err cause: %s\n", - mme_index, guadi2_mme_wap_error_cause[i]); + gaudi2_print_event(hdev, event_type, true, + "err cause: %s", guadi2_mme_wap_error_cause[i]); sts_clr_val |= BIT(i); error_count++; } @@ -8170,7 +8147,8 @@ static int gaudi2_handle_mme_wap_err(struct hl_device *hdev, u8 mme_index, return error_count; } -static int gaudi2_handle_kdma_core_event(struct hl_device *hdev, u64 intr_cause_data) +static int gaudi2_handle_kdma_core_event(struct hl_device *hdev, u16 event_type, + u64 intr_cause_data) { u32 error_count = 0; int i; @@ -8182,23 +8160,24 @@ static int gaudi2_handle_kdma_core_event(struct hl_device *hdev, u64 intr_cause_ */ for (i = 0 ; i < GAUDI2_NUM_OF_DMA_CORE_INTR_CAUSE ; i++) if (intr_cause_data & BIT(i)) { - dev_err_ratelimited(hdev->dev, "kdma core err cause: %s\n", - gaudi2_kdma_core_interrupts_cause[i]); + gaudi2_print_event(hdev, event_type, true, + "err cause: %s", gaudi2_kdma_core_interrupts_cause[i]); error_count++; } return error_count; } -static int gaudi2_handle_dma_core_event(struct hl_device *hdev, u64 intr_cause_data) +static int gaudi2_handle_dma_core_event(struct hl_device *hdev, u16 event_type, + u64 intr_cause_data) { u32 error_count = 0; int i; for (i = 0 ; i < GAUDI2_NUM_OF_DMA_CORE_INTR_CAUSE ; i++) if (intr_cause_data & BIT(i)) { - dev_err_ratelimited(hdev->dev, "dma core err cause: %s\n", - gaudi2_dma_core_interrupts_cause[i]); + gaudi2_print_event(hdev, event_type, true, + "err cause: %s", gaudi2_dma_core_interrupts_cause[i]); error_count++; } @@ -8238,8 +8217,8 @@ static void gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(struct hl_device *hdev, } } -static int gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u64 intr_cause_data, - u64 *event_mask) +static int gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u16 event_type, + u64 intr_cause_data, u64 *event_mask) { u32 error_count = 0; int i; @@ -8248,8 +8227,8 @@ static int gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u64 intr_caus if (!(intr_cause_data & BIT_ULL(i))) continue; - dev_err_ratelimited(hdev->dev, "PCIE ADDR DEC Error: %s\n", - gaudi2_pcie_addr_dec_error_cause[i]); + gaudi2_print_event(hdev, event_type, true, + "err cause: %s", gaudi2_pcie_addr_dec_error_cause[i]); error_count++; switch (intr_cause_data & BIT_ULL(i)) { @@ -8264,7 +8243,8 @@ static int gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u64 intr_caus return error_count; } -static int gaudi2_handle_pif_fatal(struct hl_device *hdev, u64 intr_cause_data) +static int gaudi2_handle_pif_fatal(struct hl_device *hdev, u16 event_type, + u64 intr_cause_data) { u32 error_count = 0; @@ -8272,8 +8252,8 @@ static int gaudi2_handle_pif_fatal(struct hl_device *hdev, u64 intr_cause_data) for (i = 0 ; i < GAUDI2_NUM_OF_PMMU_FATAL_ERR_CAUSE ; i++) { if (intr_cause_data & BIT_ULL(i)) { - dev_err_ratelimited(hdev->dev, "PMMU PIF err cause: %s\n", - gaudi2_pmmu_fatal_interrupts_cause[i]); + gaudi2_print_event(hdev, event_type, true, + "err cause: %s", gaudi2_pmmu_fatal_interrupts_cause[i]); error_count++; } } @@ -8283,16 +8263,13 @@ static int gaudi2_handle_pif_fatal(struct hl_device *hdev, u64 intr_cause_data) static int gaudi2_handle_hif_fatal(struct hl_device *hdev, u16 event_type, u64 intr_cause_data) { - u32 dcore_id, hif_id, error_count = 0; + u32 error_count = 0; int i; - dcore_id = (event_type - GAUDI2_EVENT_HIF0_FATAL) / 4; - hif_id = (event_type - GAUDI2_EVENT_HIF0_FATAL) % 4; - for (i = 0 ; i < GAUDI2_NUM_OF_HIF_FATAL_ERR_CAUSE ; i++) { if (intr_cause_data & BIT_ULL(i)) { - dev_err_ratelimited(hdev->dev, "DCORE%u_HIF%u: %s\n", dcore_id, hif_id, - gaudi2_hif_fatal_interrupts_cause[i]); + gaudi2_print_event(hdev, event_type, true, + "err cause: %s", gaudi2_hif_fatal_interrupts_cause[i]); error_count++; } } @@ -8343,7 +8320,7 @@ static void gaudi2_handle_access_error(struct hl_device *hdev, u64 mmu_base, boo WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_ERROR_CAPTURE), 0); } -static int gaudi2_handle_mmu_spi_sei_generic(struct hl_device *hdev, const char *mmu_name, +static int gaudi2_handle_mmu_spi_sei_generic(struct hl_device *hdev, u16 event_type, u64 mmu_base, bool is_pmmu, u64 *event_mask) { u32 spi_sei_cause, interrupt_clr = 0x0, error_count = 0; @@ -8353,8 +8330,8 @@ static int gaudi2_handle_mmu_spi_sei_generic(struct hl_device *hdev, const char for (i = 0 ; i < GAUDI2_NUM_OF_MMU_SPI_SEI_CAUSE ; i++) { if (spi_sei_cause & BIT(i)) { - dev_err_ratelimited(hdev->dev, "%s SPI_SEI ERR. err cause: %s\n", - mmu_name, gaudi2_mmu_spi_sei[i].cause); + gaudi2_print_event(hdev, event_type, true, + "err cause: %s", gaudi2_mmu_spi_sei[i].cause); if (i == 0) gaudi2_handle_page_error(hdev, mmu_base, is_pmmu, event_mask); @@ -8377,7 +8354,7 @@ static int gaudi2_handle_mmu_spi_sei_generic(struct hl_device *hdev, const char return error_count; } -static int gaudi2_handle_sm_err(struct hl_device *hdev, u8 sm_index) +static int gaudi2_handle_sm_err(struct hl_device *hdev, u16 event_type, u8 sm_index) { u32 sei_cause_addr, sei_cause_val, sei_cause_cause, sei_cause_log, cq_intr_addr, cq_intr_val, cq_intr_queue_index, error_count = 0; @@ -8400,11 +8377,11 @@ static int gaudi2_handle_sm_err(struct hl_device *hdev, u8 sm_index) if (!(sei_cause_cause & BIT(i))) continue; - dev_err_ratelimited(hdev->dev, "SM%u SEI ERR. err cause: %s. %s: 0x%X\n", - sm_index, - gaudi2_sm_sei_cause[i].cause_name, - gaudi2_sm_sei_cause[i].log_name, - sei_cause_log & gaudi2_sm_sei_cause[i].log_mask); + gaudi2_print_event(hdev, event_type, true, + "err cause: %s. %s: 0x%X\n", + gaudi2_sm_sei_cause[i].cause_name, + gaudi2_sm_sei_cause[i].log_name, + sei_cause_log & gaudi2_sm_sei_cause[i].log_mask); error_count++; break; } @@ -8433,7 +8410,6 @@ static int gaudi2_handle_sm_err(struct hl_device *hdev, u8 sm_index) static int gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type, u64 *event_mask) { bool is_pmmu = false; - char desc[32]; u32 error_count = 0; u64 mmu_base; u8 index; @@ -8442,54 +8418,46 @@ static int gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type, case GAUDI2_EVENT_HMMU0_PAGE_FAULT_OR_WR_PERM ... GAUDI2_EVENT_HMMU3_SECURITY_ERROR: index = (event_type - GAUDI2_EVENT_HMMU0_PAGE_FAULT_OR_WR_PERM) / 3; mmu_base = mmDCORE0_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "DCORE0_HMMU%d", index); break; case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_3_AXI_ERR_RSP: index = (event_type - GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP); mmu_base = mmDCORE0_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "DCORE0_HMMU%d", index); break; case GAUDI2_EVENT_HMMU8_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_HMMU11_SECURITY_ERROR: index = (event_type - GAUDI2_EVENT_HMMU8_PAGE_FAULT_WR_PERM) / 3; mmu_base = mmDCORE1_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "DCORE1_HMMU%d", index); break; case GAUDI2_EVENT_HMMU_8_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_11_AXI_ERR_RSP: index = (event_type - GAUDI2_EVENT_HMMU_8_AXI_ERR_RSP); mmu_base = mmDCORE1_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "DCORE1_HMMU%d", index); break; case GAUDI2_EVENT_HMMU7_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_HMMU4_SECURITY_ERROR: index = (event_type - GAUDI2_EVENT_HMMU7_PAGE_FAULT_WR_PERM) / 3; mmu_base = mmDCORE2_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "DCORE2_HMMU%d", index); break; case GAUDI2_EVENT_HMMU_7_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_4_AXI_ERR_RSP: index = (event_type - GAUDI2_EVENT_HMMU_7_AXI_ERR_RSP); mmu_base = mmDCORE2_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "DCORE2_HMMU%d", index); break; case GAUDI2_EVENT_HMMU15_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR: index = (event_type - GAUDI2_EVENT_HMMU15_PAGE_FAULT_WR_PERM) / 3; mmu_base = mmDCORE3_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "DCORE3_HMMU%d", index); break; case GAUDI2_EVENT_HMMU_15_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP: index = (event_type - GAUDI2_EVENT_HMMU_15_AXI_ERR_RSP); mmu_base = mmDCORE3_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "DCORE3_HMMU%d", index); break; case GAUDI2_EVENT_PMMU0_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_PMMU0_SECURITY_ERROR: case GAUDI2_EVENT_PMMU_AXI_ERR_RSP_0: is_pmmu = true; mmu_base = mmPMMU_HBW_MMU_BASE; - snprintf(desc, ARRAY_SIZE(desc), "PMMU"); break; default: return 0; } - error_count = gaudi2_handle_mmu_spi_sei_generic(hdev, desc, mmu_base, is_pmmu, event_mask); + error_count = gaudi2_handle_mmu_spi_sei_generic(hdev, event_type, mmu_base, + is_pmmu, event_mask); return error_count; } @@ -8611,22 +8579,17 @@ static bool gaudi2_handle_hbm_mc_sei_err(struct hl_device *hdev, u16 event_type, cause_idx = sei_data->hdr.sei_cause; if (cause_idx > GAUDI2_NUM_OF_HBM_SEI_CAUSE - 1) { - dev_err_ratelimited(hdev->dev, "Invalid HBM SEI event cause (%d) provided by FW\n", - cause_idx); + gaudi2_print_event(hdev, event_type, true, + "err cause: %s", + "Invalid HBM SEI event cause (%d) provided by FW\n", cause_idx); return true; } - if (sei_data->hdr.is_critical) - dev_err(hdev->dev, - "System Critical Error Interrupt - HBM(%u) MC(%u) MC_CH(%u) MC_PC(%u). Error cause: %s\n", - hbm_id, mc_id, sei_data->hdr.mc_channel, sei_data->hdr.mc_pseudo_channel, - hbm_mc_sei_cause[cause_idx]); - - else - dev_err_ratelimited(hdev->dev, - "System Non-Critical Error Interrupt - HBM(%u) MC(%u) MC_CH(%u) MC_PC(%u). Error cause: %s\n", - hbm_id, mc_id, sei_data->hdr.mc_channel, sei_data->hdr.mc_pseudo_channel, - hbm_mc_sei_cause[cause_idx]); + gaudi2_print_event(hdev, event_type, !sei_data->hdr.is_critical, + "System %s Error Interrupt - HBM(%u) MC(%u) MC_CH(%u) MC_PC(%u). Error cause: %s\n", + sei_data->hdr.is_critical ? "Critical" : "Non-critical", + hbm_id, mc_id, sei_data->hdr.mc_channel, sei_data->hdr.mc_pseudo_channel, + hbm_mc_sei_cause[cause_idx]); /* Print error-specific info */ switch (cause_idx) { @@ -8670,12 +8633,12 @@ static bool gaudi2_handle_hbm_mc_sei_err(struct hl_device *hdev, u16 event_type, return require_hard_reset; } -static int gaudi2_handle_hbm_cattrip(struct hl_device *hdev, u64 intr_cause_data) +static int gaudi2_handle_hbm_cattrip(struct hl_device *hdev, u16 event_type, + u64 intr_cause_data) { if (intr_cause_data) { - dev_err(hdev->dev, - "HBM catastrophic temperature error (CATTRIP) cause %#llx\n", - intr_cause_data); + gaudi2_print_event(hdev, event_type, true, + "temperature error cause: %#llx", intr_cause_data); return 1; } @@ -8708,13 +8671,13 @@ static void gaudi2_print_clk_change_info(struct hl_device *hdev, u16 event_type, hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_POWER; hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get(); hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time; - dev_info_ratelimited(hdev->dev, "Clock throttling due to power consumption\n"); + dev_dbg_ratelimited(hdev->dev, "Clock throttling due to power consumption\n"); break; case GAUDI2_EVENT_CPU_FIX_POWER_ENV_E: hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_POWER; hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get(); - dev_info_ratelimited(hdev->dev, "Power envelop is safe, back to optimal clock\n"); + dev_dbg_ratelimited(hdev->dev, "Power envelop is safe, back to optimal clock\n"); break; case GAUDI2_EVENT_CPU_FIX_THERMAL_ENV_S: @@ -8741,16 +8704,18 @@ static void gaudi2_print_clk_change_info(struct hl_device *hdev, u16 event_type, mutex_unlock(&hdev->clk_throttling.lock); } -static void gaudi2_print_out_of_sync_info(struct hl_device *hdev, +static void gaudi2_print_out_of_sync_info(struct hl_device *hdev, u16 event_type, struct cpucp_pkt_sync_err *sync_err) { struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI2_QUEUE_ID_CPU_PQ]; - dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n", - le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci)); + gaudi2_print_event(hdev, event_type, false, + "FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n", + le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), + q->pi, atomic_read(&q->ci)); } -static int gaudi2_handle_pcie_p2p_msix(struct hl_device *hdev) +static int gaudi2_handle_pcie_p2p_msix(struct hl_device *hdev, u16 event_type) { u32 p2p_intr, msix_gw_intr, error_count = 0; @@ -8758,7 +8723,7 @@ static int gaudi2_handle_pcie_p2p_msix(struct hl_device *hdev) msix_gw_intr = RREG32(mmPCIE_WRAP_MSIX_GW_INTR); if (p2p_intr) { - dev_err_ratelimited(hdev->dev, + gaudi2_print_event(hdev, event_type, true, "pcie p2p transaction terminated due to security, req_id(0x%x)\n", RREG32(mmPCIE_WRAP_P2P_REQ_ID)); @@ -8767,7 +8732,7 @@ static int gaudi2_handle_pcie_p2p_msix(struct hl_device *hdev) } if (msix_gw_intr) { - dev_err_ratelimited(hdev->dev, + gaudi2_print_event(hdev, event_type, true, "pcie msi-x gen denied due to vector num check failure, vec(0x%X)\n", RREG32(mmPCIE_WRAP_MSIX_GW_VEC)); @@ -8822,17 +8787,18 @@ static int gaudi2_handle_psoc_drain(struct hl_device *hdev, u64 intr_cause_data) return error_count; } -static void gaudi2_print_cpu_pkt_failure_info(struct hl_device *hdev, +static void gaudi2_print_cpu_pkt_failure_info(struct hl_device *hdev, u16 event_type, struct cpucp_pkt_sync_err *sync_err) { struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI2_QUEUE_ID_CPU_PQ]; - dev_warn(hdev->dev, + gaudi2_print_event(hdev, event_type, false, "FW reported sanity check failure, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n", le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci)); } -static int hl_arc_event_handle(struct hl_device *hdev, struct hl_eq_engine_arc_intr_data *data) +static int hl_arc_event_handle(struct hl_device *hdev, u16 event_type, + struct hl_eq_engine_arc_intr_data *data) { struct hl_engine_arc_dccm_queue_full_irq *q; u32 intr_type, engine_id; @@ -8846,12 +8812,12 @@ static int hl_arc_event_handle(struct hl_device *hdev, struct hl_eq_engine_arc_i case ENGINE_ARC_DCCM_QUEUE_FULL_IRQ: q = (struct hl_engine_arc_dccm_queue_full_irq *) &payload; - dev_err_ratelimited(hdev->dev, + gaudi2_print_event(hdev, event_type, true, "ARC DCCM Full event: EngId: %u, Intr_type: %u, Qidx: %u\n", engine_id, intr_type, q->queue_index); return 1; default: - dev_err_ratelimited(hdev->dev, "Unknown ARC event type\n"); + gaudi2_print_event(hdev, event_type, true, "Unknown ARC event type\n"); return 0; } } @@ -8860,8 +8826,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent { struct gaudi2_device *gaudi2 = hdev->asic_specific; bool reset_required = false, is_critical = false; - u32 ctl, reset_flags = HL_DRV_RESET_HARD, error_count = 0; - int index, sbte_index; + u32 index, ctl, reset_flags = HL_DRV_RESET_HARD, error_count = 0; u64 event_mask = 0; u16 event_type; @@ -8877,8 +8842,6 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent gaudi2->events_stat[event_type]++; gaudi2->events_stat_aggregate[event_type]++; - gaudi2_print_irq_info(hdev, event_type); - switch (event_type) { case GAUDI2_EVENT_PCIE_CORE_SERR ... GAUDI2_EVENT_ARC0_ECC_DERR: fallthrough; @@ -8901,12 +8864,12 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_ARC_AXI_ERROR_RESPONSE_0: reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; - error_count = gaudi2_handle_arc_farm_sei_err(hdev); + error_count = gaudi2_handle_arc_farm_sei_err(hdev, event_type); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; case GAUDI2_EVENT_CPU_AXI_ERR_RSP: - error_count = gaudi2_handle_cpu_sei_err(hdev); + error_count = gaudi2_handle_cpu_sei_err(hdev, event_type); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; @@ -8921,7 +8884,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE: case GAUDI2_EVENT_ROTATOR1_AXI_ERROR_RESPONSE: index = event_type - GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE; - error_count = gaudi2_handle_rot_err(hdev, index, + error_count = gaudi2_handle_rot_err(hdev, index, event_type, &eq_entry->razwi_with_intr_cause, &event_mask); error_count += gaudi2_handle_qm_sei_err(hdev, event_type, NULL, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; @@ -8929,7 +8892,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_TPC0_AXI_ERR_RSP ... GAUDI2_EVENT_TPC24_AXI_ERR_RSP: index = event_type - GAUDI2_EVENT_TPC0_AXI_ERR_RSP; - error_count = gaudi2_tpc_ack_interrupts(hdev, index, "AXI_ERR_RSP", + error_count = gaudi2_tpc_ack_interrupts(hdev, index, event_type, &eq_entry->razwi_with_intr_cause, &event_mask); error_count += gaudi2_handle_qm_sei_err(hdev, event_type, NULL, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; @@ -8937,8 +8900,8 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE ... GAUDI2_EVENT_DEC9_AXI_ERR_RSPONSE: index = event_type - GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE; - error_count = gaudi2_handle_dec_err(hdev, index, "AXI_ERR_RESPONSE", - &eq_entry->razwi_info, &event_mask); + error_count = gaudi2_handle_dec_err(hdev, index, event_type, + &eq_entry->razwi_info, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -8969,8 +8932,8 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_TPC24_KERNEL_ERR: index = (event_type - GAUDI2_EVENT_TPC0_KERNEL_ERR) / (GAUDI2_EVENT_TPC1_KERNEL_ERR - GAUDI2_EVENT_TPC0_KERNEL_ERR); - error_count = gaudi2_tpc_ack_interrupts(hdev, index, "KRN_ERR", - &eq_entry->razwi_with_intr_cause, &event_mask); + error_count = gaudi2_tpc_ack_interrupts(hdev, index, event_type, + &eq_entry->razwi_with_intr_cause, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -8986,7 +8949,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_DEC9_SPI: index = (event_type - GAUDI2_EVENT_DEC0_SPI) / (GAUDI2_EVENT_DEC1_SPI - GAUDI2_EVENT_DEC0_SPI); - error_count = gaudi2_handle_dec_err(hdev, index, "SPI", + error_count = gaudi2_handle_dec_err(hdev, index, event_type, &eq_entry->razwi_info, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -8998,8 +8961,8 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent index = (event_type - GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE) / (GAUDI2_EVENT_MME1_CTRL_AXI_ERROR_RESPONSE - GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE); - error_count = gaudi2_handle_mme_err(hdev, index, - "CTRL_AXI_ERROR_RESPONSE", &eq_entry->razwi_info, &event_mask); + error_count = gaudi2_handle_mme_err(hdev, index, event_type, + &eq_entry->razwi_info, &event_mask); error_count += gaudi2_handle_qm_sei_err(hdev, event_type, NULL, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -9011,7 +8974,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent index = (event_type - GAUDI2_EVENT_MME0_QMAN_SW_ERROR) / (GAUDI2_EVENT_MME1_QMAN_SW_ERROR - GAUDI2_EVENT_MME0_QMAN_SW_ERROR); - error_count = gaudi2_handle_mme_err(hdev, index, "QMAN_SW_ERROR", + error_count = gaudi2_handle_mme_err(hdev, index, event_type, &eq_entry->razwi_info, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -9023,26 +8986,26 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent index = (event_type - GAUDI2_EVENT_MME0_WAP_SOURCE_RESULT_INVALID) / (GAUDI2_EVENT_MME1_WAP_SOURCE_RESULT_INVALID - GAUDI2_EVENT_MME0_WAP_SOURCE_RESULT_INVALID); - error_count = gaudi2_handle_mme_wap_err(hdev, index, + error_count = gaudi2_handle_mme_wap_err(hdev, index, event_type, &eq_entry->razwi_info, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; case GAUDI2_EVENT_KDMA_CH0_AXI_ERR_RSP: case GAUDI2_EVENT_KDMA0_CORE: - error_count = gaudi2_handle_kdma_core_event(hdev, + error_count = gaudi2_handle_kdma_core_event(hdev, event_type, le64_to_cpu(eq_entry->intr_cause.intr_cause_data)); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; case GAUDI2_EVENT_HDMA2_CORE ... GAUDI2_EVENT_PDMA1_CORE: - error_count = gaudi2_handle_dma_core_event(hdev, + error_count = gaudi2_handle_dma_core_event(hdev, event_type, le64_to_cpu(eq_entry->intr_cause.intr_cause_data)); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; case GAUDI2_EVENT_PCIE_ADDR_DEC_ERR: - error_count = gaudi2_print_pcie_addr_dec_info(hdev, + error_count = gaudi2_print_pcie_addr_dec_info(hdev, event_type, le64_to_cpu(eq_entry->intr_cause.intr_cause_data), &event_mask); reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; @@ -9065,7 +9028,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent break; case GAUDI2_EVENT_PMMU_FATAL_0: - error_count = gaudi2_handle_pif_fatal(hdev, + error_count = gaudi2_handle_pif_fatal(hdev, event_type, le64_to_cpu(eq_entry->intr_cause.intr_cause_data)); reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; @@ -9086,7 +9049,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent break; case GAUDI2_EVENT_HBM_CATTRIP_0 ... GAUDI2_EVENT_HBM_CATTRIP_5: - error_count = gaudi2_handle_hbm_cattrip(hdev, + error_count = gaudi2_handle_hbm_cattrip(hdev, event_type, le64_to_cpu(eq_entry->intr_cause.intr_cause_data)); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; @@ -9122,14 +9085,8 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_MME1_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME1_SBTE4_AXI_ERR_RSP: case GAUDI2_EVENT_MME2_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME2_SBTE4_AXI_ERR_RSP: case GAUDI2_EVENT_MME3_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME3_SBTE4_AXI_ERR_RSP: - index = (event_type - GAUDI2_EVENT_MME0_SBTE0_AXI_ERR_RSP) / - (GAUDI2_EVENT_MME1_SBTE0_AXI_ERR_RSP - - GAUDI2_EVENT_MME0_SBTE0_AXI_ERR_RSP); - sbte_index = (event_type - GAUDI2_EVENT_MME0_SBTE0_AXI_ERR_RSP) % - (GAUDI2_EVENT_MME1_SBTE0_AXI_ERR_RSP - - GAUDI2_EVENT_MME0_SBTE0_AXI_ERR_RSP); - error_count = gaudi2_handle_mme_sbte_err(hdev, index, sbte_index, - le64_to_cpu(eq_entry->intr_cause.intr_cause_data)); + error_count = gaudi2_handle_mme_sbte_err(hdev, event_type, + le64_to_cpu(eq_entry->intr_cause.intr_cause_data)); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; case GAUDI2_EVENT_VM0_ALARM_A ... GAUDI2_EVENT_VM3_ALARM_B: @@ -9217,7 +9174,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent break; case GAUDI2_EVENT_CPU_PKT_QUEUE_OUT_SYNC: - gaudi2_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err); + gaudi2_print_out_of_sync_info(hdev, event_type, &eq_entry->pkt_sync_err); error_count = GAUDI2_NA_EVENT_CAUSE; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; @@ -9229,13 +9186,13 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent break; case GAUDI2_EVENT_PCIE_P2P_MSIX: - error_count = gaudi2_handle_pcie_p2p_msix(hdev); + error_count = gaudi2_handle_pcie_p2p_msix(hdev, event_type); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; case GAUDI2_EVENT_SM0_AXI_ERROR_RESPONSE ... GAUDI2_EVENT_SM3_AXI_ERROR_RESPONSE: index = event_type - GAUDI2_EVENT_SM0_AXI_ERROR_RESPONSE; - error_count = gaudi2_handle_sm_err(hdev, index); + error_count = gaudi2_handle_sm_err(hdev, event_type, index); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -9258,13 +9215,13 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent break; case GAUDI2_EVENT_CPU_PKT_SANITY_FAILED: - gaudi2_print_cpu_pkt_failure_info(hdev, &eq_entry->pkt_sync_err); + gaudi2_print_cpu_pkt_failure_info(hdev, event_type, &eq_entry->pkt_sync_err); error_count = GAUDI2_NA_EVENT_CAUSE; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; case GAUDI2_EVENT_ARC_DCCM_FULL: - error_count = hl_arc_event_handle(hdev, &eq_entry->arc_data); + error_count = hl_arc_event_handle(hdev, event_type, &eq_entry->arc_data); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -9286,7 +9243,9 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent * Note that although we have counted the errors, we use this number as * a boolean. */ - if (error_count == 0 && !is_info_event(event_type)) + if (error_count == GAUDI2_NA_EVENT_CAUSE && !is_info_event(event_type)) + gaudi2_print_event(hdev, event_type, true, "%d", event_type); + else if (error_count == 0) dev_err_ratelimited(hdev->dev, "No Error cause for H/W event %d\n", event_type);