accel/habanalabs: add info ioctl for engine error reports

The user is notified of every engine error report, but still lacks
information on which engine reported it. Hence, allow the user to query
for the exact engine that reported an error.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
Ofir Bitton 2023-05-23 10:42:19 +03:00 committed by Oded Gabbay
parent 10926f6005
commit a8ab1a81cc
5 changed files with 240 additions and 0 deletions

View File

@ -2701,6 +2701,20 @@ void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info)
*info->event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR;
}
/*
 * hl_capture_engine_err() - record the details of an engine error.
 * @hdev: pointer to the habanalabs device structure.
 * @engine_id: id of the engine that reported the error.
 * @error_count: number of errors reported.
 *
 * Only the caller that flips event_detected from 0 to 1 records anything;
 * while the flag stays set, later calls return without touching the record.
 */
void hl_capture_engine_err(struct hl_device *hdev, u16 engine_id, u16 error_count)
{
	struct engine_err_info *engine_err = &hdev->captured_err_info.engine_err;

	/* The flag was already set: an engine error has been captured before */
	if (atomic_cmpxchg(&engine_err->event_detected, 0, 1) != 0)
		return;

	engine_err->event.engine_id = engine_id;
	engine_err->event.error_count = error_count;
	engine_err->event.timestamp = ktime_to_ns(ktime_get());

	/* Set last, once all the event fields above have been filled in */
	engine_err->event_info_available = true;
}
void hl_enable_err_info_capture(struct hl_error_info *captured_err_info)
{
vfree(captured_err_info->page_fault_info.user_mappings);

View File

@ -3062,6 +3062,20 @@ struct fw_err_info {
bool event_info_available;
};
/**
 * struct engine_err_info - engine error information.
 * @event: holds information on the event (timestamp, engine id and error
 *         count), in the layout that the engine error info ioctl copies to
 *         the user buffer.
 * @event_detected: if set as 1, then an engine event was discovered for the
 *                  first time after the driver has finished booting-up.
 *                  hl_capture_engine_err() moves it from 0 to 1 with an atomic
 *                  compare-and-exchange, so only the first engine error is
 *                  recorded.
 * @event_info_available: indicates that an engine event info is now available.
 *                        Set by hl_capture_engine_err() after @event has been
 *                        filled in; while it is false the info ioctl returns 0
 *                        without copying anything to the user.
 */
struct engine_err_info {
struct hl_info_engine_err_event event;
atomic_t event_detected;
bool event_info_available;
};
/**
* struct hl_error_info - holds information collected during an error.
* @cs_timeout: CS timeout error information.
@ -3070,6 +3084,7 @@ struct fw_err_info {
* @page_fault_info: page fault information.
* @hw_err: (fatal) hardware error information.
* @fw_err: firmware error information.
* @engine_err: engine error information.
*/
struct hl_error_info {
struct cs_timeout_info cs_timeout;
@ -3078,6 +3093,7 @@ struct hl_error_info {
struct page_fault_info page_fault_info;
struct hw_err_info hw_err;
struct fw_err_info fw_err;
struct engine_err_info engine_err;
};
/**
@ -3952,6 +3968,7 @@ void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_
u64 *event_mask);
void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_mask);
void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info);
void hl_capture_engine_err(struct hl_device *hdev, u16 engine_id, u16 error_count);
void hl_enable_err_info_capture(struct hl_error_info *captured_err_info);
#ifdef CONFIG_DEBUG_FS

View File

@ -875,6 +875,28 @@ static int fw_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
return rc ? -EFAULT : 0;
}
static int engine_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
void __user *user_buf = (void __user *) (uintptr_t) args->return_pointer;
struct hl_device *hdev = hpriv->hdev;
u32 user_buf_size = args->return_size;
struct engine_err_info *info;
int rc;
if (!user_buf)
return -EINVAL;
info = &hdev->captured_err_info.engine_err;
if (!info->event_info_available)
return 0;
if (user_buf_size < sizeof(struct hl_info_engine_err_event))
return -ENOMEM;
rc = copy_to_user(user_buf, &info->event, sizeof(struct hl_info_engine_err_event));
return rc ? -EFAULT : 0;
}
static int send_fw_generic_request(struct hl_device *hdev, struct hl_info_args *info_args)
{
void __user *buff = (void __user *) (uintptr_t) info_args->return_pointer;
@ -1001,6 +1023,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_FW_ERR_EVENT:
return fw_err_info(hpriv, args);
case HL_INFO_USER_ENGINE_ERR_EVENT:
return engine_err_info(hpriv, args);
case HL_INFO_DRAM_USAGE:
return dram_usage_info(hpriv, args);
default:

View File

@ -9589,6 +9589,171 @@ static int hl_arc_event_handle(struct hl_device *hdev, u16 event_type,
}
}
/*
 * event_id_to_engine_id() - translate an event type to the id of the engine
 * it belongs to.
 * @hdev: pointer to the habanalabs device structure (not used by the
 *        translation itself).
 * @event_type: the event type to translate.
 *
 * The first switch either returns the engine id directly (KDMA, PDMA, NIC,
 * rotator and HDMA events) or reduces the event to a block type plus an index
 * inside that block type (TPC, MME and decoder events). The second switch
 * converts that (type, index) pair to an engine id.
 *
 * Return: the engine id, or U16_MAX when the event type is not matched by any
 * of the cases below.
 */
static u16 event_id_to_engine_id(struct hl_device *hdev, u16 event_type)
{
/* Stays at GAUDI2_BLOCK_TYPE_MAX unless a TPC/MME/DEC case below sets it */
enum gaudi2_block_types type = GAUDI2_BLOCK_TYPE_MAX;
/* Assigned by every case that sets 'type'; not read otherwise */
u16 index;
switch (event_type) {
/* The offset from the first event of the range is used as the TPC index */
case GAUDI2_EVENT_TPC0_AXI_ERR_RSP ... GAUDI2_EVENT_TPC24_AXI_ERR_RSP:
index = event_type - GAUDI2_EVENT_TPC0_AXI_ERR_RSP;
type = GAUDI2_BLOCK_TYPE_TPC;
break;
case GAUDI2_EVENT_TPC0_QM ... GAUDI2_EVENT_TPC24_QM:
index = event_type - GAUDI2_EVENT_TPC0_QM;
type = GAUDI2_BLOCK_TYPE_TPC;
break;
/* All the events of a given MME collapse to that MME's fixed index */
case GAUDI2_EVENT_MME0_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE:
case GAUDI2_EVENT_MME0_SPI_BASE ... GAUDI2_EVENT_MME0_WAP_SOURCE_RESULT_INVALID:
case GAUDI2_EVENT_MME0_QM:
index = 0;
type = GAUDI2_BLOCK_TYPE_MME;
break;
case GAUDI2_EVENT_MME1_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME1_CTRL_AXI_ERROR_RESPONSE:
case GAUDI2_EVENT_MME1_SPI_BASE ... GAUDI2_EVENT_MME1_WAP_SOURCE_RESULT_INVALID:
case GAUDI2_EVENT_MME1_QM:
index = 1;
type = GAUDI2_BLOCK_TYPE_MME;
break;
case GAUDI2_EVENT_MME2_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME2_CTRL_AXI_ERROR_RESPONSE:
case GAUDI2_EVENT_MME2_SPI_BASE ... GAUDI2_EVENT_MME2_WAP_SOURCE_RESULT_INVALID:
case GAUDI2_EVENT_MME2_QM:
index = 2;
type = GAUDI2_BLOCK_TYPE_MME;
break;
case GAUDI2_EVENT_MME3_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME3_CTRL_AXI_ERROR_RESPONSE:
case GAUDI2_EVENT_MME3_SPI_BASE ... GAUDI2_EVENT_MME3_WAP_SOURCE_RESULT_INVALID:
case GAUDI2_EVENT_MME3_QM:
index = 3;
type = GAUDI2_BLOCK_TYPE_MME;
break;
/* KDMA and PDMA events map to a single engine id each */
case GAUDI2_EVENT_KDMA_CH0_AXI_ERR_RSP:
case GAUDI2_EVENT_KDMA_BM_SPMU:
case GAUDI2_EVENT_KDMA0_CORE:
return GAUDI2_ENGINE_ID_KDMA;
case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP:
case GAUDI2_EVENT_PDMA0_CORE:
case GAUDI2_EVENT_PDMA0_BM_SPMU:
case GAUDI2_EVENT_PDMA0_QM:
return GAUDI2_ENGINE_ID_PDMA_0;
case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP:
case GAUDI2_EVENT_PDMA1_CORE:
case GAUDI2_EVENT_PDMA1_BM_SPMU:
case GAUDI2_EVENT_PDMA1_QM:
return GAUDI2_ENGINE_ID_PDMA_1;
case GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE ... GAUDI2_EVENT_DEC9_AXI_ERR_RSPONSE:
index = event_type - GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE;
type = GAUDI2_BLOCK_TYPE_DEC;
break;
/*
 * The offset is halved - presumably this range holds two consecutive event
 * ids per decoder (SPI, BMON_SPMU); verify against the event id enum.
 */
case GAUDI2_EVENT_DEC0_SPI ... GAUDI2_EVENT_DEC9_BMON_SPMU:
index = (event_type - GAUDI2_EVENT_DEC0_SPI) >> 1;
type = GAUDI2_BLOCK_TYPE_DEC;
break;
/*
 * The offset is doubled - presumably one event id per NIC here while the
 * engine ids advance by two per NIC; verify against the engine id enum.
 */
case GAUDI2_EVENT_NIC0_AXI_ERROR_RESPONSE ... GAUDI2_EVENT_NIC11_AXI_ERROR_RESPONSE:
index = event_type - GAUDI2_EVENT_NIC0_AXI_ERROR_RESPONSE;
return GAUDI2_ENGINE_ID_NIC0_0 + (index * 2);
/* The offset is added to the first NIC engine id as is */
case GAUDI2_EVENT_NIC0_QM0 ... GAUDI2_EVENT_NIC11_QM1:
index = event_type - GAUDI2_EVENT_NIC0_QM0;
return GAUDI2_ENGINE_ID_NIC0_0 + index;
/*
 * NOTE(review): the range bounds suggest BMON_SPMU and SW_ERROR events are
 * interleaved per NIC, yet the offset is doubled here rather than halved as
 * in the TPC/DEC BMON ranges - confirm against the event id enum that this
 * yields the intended NIC engine id.
 */
case GAUDI2_EVENT_NIC0_BMON_SPMU ... GAUDI2_EVENT_NIC11_SW_ERROR:
index = event_type - GAUDI2_EVENT_NIC0_BMON_SPMU;
return GAUDI2_ENGINE_ID_NIC0_0 + (index * 2);
/* The offset is halved - presumably two consecutive event ids per TPC; verify */
case GAUDI2_EVENT_TPC0_BMON_SPMU ... GAUDI2_EVENT_TPC24_KERNEL_ERR:
index = (event_type - GAUDI2_EVENT_TPC0_BMON_SPMU) >> 1;
type = GAUDI2_BLOCK_TYPE_TPC;
break;
case GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE:
case GAUDI2_EVENT_ROTATOR0_BMON_SPMU:
case GAUDI2_EVENT_ROTATOR0_ROT0_QM:
return GAUDI2_ENGINE_ID_ROT_0;
case GAUDI2_EVENT_ROTATOR1_AXI_ERROR_RESPONSE:
case GAUDI2_EVENT_ROTATOR1_BMON_SPMU:
case GAUDI2_EVENT_ROTATOR1_ROT1_QM:
return GAUDI2_ENGINE_ID_ROT_1;
/* HDMA0..HDMA7 events are reported as EDMA_0/EDMA_1 of DCORE0..DCORE3 */
case GAUDI2_EVENT_HDMA0_BM_SPMU:
case GAUDI2_EVENT_HDMA0_QM:
case GAUDI2_EVENT_HDMA0_CORE:
return GAUDI2_DCORE0_ENGINE_ID_EDMA_0;
case GAUDI2_EVENT_HDMA1_BM_SPMU:
case GAUDI2_EVENT_HDMA1_QM:
case GAUDI2_EVENT_HDMA1_CORE:
return GAUDI2_DCORE0_ENGINE_ID_EDMA_1;
case GAUDI2_EVENT_HDMA2_BM_SPMU:
case GAUDI2_EVENT_HDMA2_QM:
case GAUDI2_EVENT_HDMA2_CORE:
return GAUDI2_DCORE1_ENGINE_ID_EDMA_0;
case GAUDI2_EVENT_HDMA3_BM_SPMU:
case GAUDI2_EVENT_HDMA3_QM:
case GAUDI2_EVENT_HDMA3_CORE:
return GAUDI2_DCORE1_ENGINE_ID_EDMA_1;
case GAUDI2_EVENT_HDMA4_BM_SPMU:
case GAUDI2_EVENT_HDMA4_QM:
case GAUDI2_EVENT_HDMA4_CORE:
return GAUDI2_DCORE2_ENGINE_ID_EDMA_0;
case GAUDI2_EVENT_HDMA5_BM_SPMU:
case GAUDI2_EVENT_HDMA5_QM:
case GAUDI2_EVENT_HDMA5_CORE:
return GAUDI2_DCORE2_ENGINE_ID_EDMA_1;
case GAUDI2_EVENT_HDMA6_BM_SPMU:
case GAUDI2_EVENT_HDMA6_QM:
case GAUDI2_EVENT_HDMA6_CORE:
return GAUDI2_DCORE3_ENGINE_ID_EDMA_0;
case GAUDI2_EVENT_HDMA7_BM_SPMU:
case GAUDI2_EVENT_HDMA7_QM:
case GAUDI2_EVENT_HDMA7_CORE:
return GAUDI2_DCORE3_ENGINE_ID_EDMA_1;
default:
break;
}
/* Convert the (block type, index) pair resolved above to an engine id */
switch (type) {
case GAUDI2_BLOCK_TYPE_TPC:
/*
 * For DCORE1..DCORE3 the index is rebased to the first TPC of that DCORE;
 * for DCORE0 it is added as is (presumably TPC_ID_DCORE0_TPC0 is 0).
 * NOTE(review): an index outside these four ranges (e.g. a 25th TPC, if the
 * TPC0..TPC24 event ranges produce one) ends up returning U16_MAX - confirm
 * that this is intended.
 */
switch (index) {
case TPC_ID_DCORE0_TPC0 ... TPC_ID_DCORE0_TPC5:
return GAUDI2_DCORE0_ENGINE_ID_TPC_0 + index;
case TPC_ID_DCORE1_TPC0 ... TPC_ID_DCORE1_TPC5:
return GAUDI2_DCORE1_ENGINE_ID_TPC_0 + index - TPC_ID_DCORE1_TPC0;
case TPC_ID_DCORE2_TPC0 ... TPC_ID_DCORE2_TPC5:
return GAUDI2_DCORE2_ENGINE_ID_TPC_0 + index - TPC_ID_DCORE2_TPC0;
case TPC_ID_DCORE3_TPC0 ... TPC_ID_DCORE3_TPC5:
return GAUDI2_DCORE3_ENGINE_ID_TPC_0 + index - TPC_ID_DCORE3_TPC0;
default:
break;
}
break;
case GAUDI2_BLOCK_TYPE_MME:
switch (index) {
case MME_ID_DCORE0: return GAUDI2_DCORE0_ENGINE_ID_MME;
case MME_ID_DCORE1: return GAUDI2_DCORE1_ENGINE_ID_MME;
case MME_ID_DCORE2: return GAUDI2_DCORE2_ENGINE_ID_MME;
case MME_ID_DCORE3: return GAUDI2_DCORE3_ENGINE_ID_MME;
default:
break;
}
break;
case GAUDI2_BLOCK_TYPE_DEC:
switch (index) {
case DEC_ID_DCORE0_DEC0: return GAUDI2_DCORE0_ENGINE_ID_DEC_0;
case DEC_ID_DCORE0_DEC1: return GAUDI2_DCORE0_ENGINE_ID_DEC_1;
case DEC_ID_DCORE1_DEC0: return GAUDI2_DCORE1_ENGINE_ID_DEC_0;
case DEC_ID_DCORE1_DEC1: return GAUDI2_DCORE1_ENGINE_ID_DEC_1;
case DEC_ID_DCORE2_DEC0: return GAUDI2_DCORE2_ENGINE_ID_DEC_0;
case DEC_ID_DCORE2_DEC1: return GAUDI2_DCORE2_ENGINE_ID_DEC_1;
case DEC_ID_DCORE3_DEC0: return GAUDI2_DCORE3_ENGINE_ID_DEC_0;
case DEC_ID_DCORE3_DEC1: return GAUDI2_DCORE3_ENGINE_ID_DEC_1;
case DEC_ID_PCIE_VDEC0: return GAUDI2_PCIE_ENGINE_ID_DEC_0;
case DEC_ID_PCIE_VDEC1: return GAUDI2_PCIE_ENGINE_ID_DEC_1;
default:
break;
}
break;
default:
break;
}
/* No engine id could be resolved for this event type */
return U16_MAX;
}
static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
{
struct gaudi2_device *gaudi2 = hdev->asic_specific;
@ -10011,6 +10176,9 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
}
}
if (event_mask & HL_NOTIFIER_EVENT_USER_ENGINE_ERR)
hl_capture_engine_err(hdev, event_id_to_engine_id(hdev, event_type), error_count);
/* Make sure to dump an error in case no error cause was printed so far.
* Note that although we have counted the errors, we use this number as
* a boolean.

View File

@ -809,6 +809,7 @@ enum hl_server_type {
* HL_INFO_FW_ERR_EVENT - Retrieve information on the reported FW error.
* May return 0 even though no new data is available, in that case
* timestamp will be 0.
* HL_INFO_USER_ENGINE_ERR_EVENT - Retrieve information on the captured engine error (engine id,
* error count and timestamp). Only the first engine error is captured.
*/
#define HL_INFO_HW_IP_INFO 0
#define HL_INFO_HW_EVENTS 1
@ -845,6 +846,7 @@ enum hl_server_type {
#define HL_INFO_FW_GENERIC_REQ 35
#define HL_INFO_HW_ERR_EVENT 36
#define HL_INFO_FW_ERR_EVENT 37
#define HL_INFO_USER_ENGINE_ERR_EVENT 38
#define HL_INFO_VERSION_MAX_LEN 128
#define HL_INFO_CARD_NAME_MAX_LEN 16
@ -1226,6 +1228,20 @@ struct hl_info_fw_err_event {
__u32 pad;
};
/**
 * struct hl_info_engine_err_event - engine error info
 * @timestamp: time-stamp of error occurrence, in nanoseconds, taken by the
 *             driver when it captures the error.
 * @engine_id: id of the engine that reported the error. May be U16_MAX when
 *             the driver has no engine mapping for the reported event.
 * @error_count: Amount of errors reported.
 * @pad: size padding for u64 granularity.
 */
struct hl_info_engine_err_event {
__s64 timestamp;
__u16 engine_id;
__u16 error_count;
__u32 pad;
};
/**
* struct hl_info_dev_memalloc_page_sizes - valid page sizes in device mem alloc information.
* @page_order_bitmask: bitmap in which a set bit represents the order of the supported page size