accel/habanalabs: print timestamp of last PQ heartbeat on EQ heartbeat failure

The test packet which is sent to FW for the PQ heartbeat is also used as
the trigger in FW to send the EQ heartbeat event.
Add the time of the last sent packet to the debug info which is printed
upon an EQ heartbeat failure.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Ofir Bitton <obitton@habana.ai>
Signed-off-by: Ofir Bitton <obitton@habana.ai>
This commit is contained in:
Tomer Tayar 2024-05-01 15:10:59 +03:00 committed by Ofir Bitton
parent c4548eee53
commit 5cb97d74c3
3 changed files with 46 additions and 12 deletions

View File

@ -1062,11 +1062,28 @@ static bool is_pci_link_healthy(struct hl_device *hdev)
return (device_id == hdev->pdev->device);
}
/**
 * stringify_time_of_last_heartbeat() - format the time of the last PQ/EQ heartbeat.
 * @hdev: pointer to the habanalabs device structure.
 * @time_str: output buffer for the formatted time.
 * @size: size of @time_str in bytes.
 * @is_pq_hb: true to format the PQ heartbeat timestamp, false for the EQ one.
 *
 * When the selected timestamp is 0 (nothing recorded), @time_str is left
 * untouched, so whatever default text the caller placed there is preserved.
 */
static void stringify_time_of_last_heartbeat(struct hl_device *hdev, char *time_str, size_t size,
						bool is_pq_hb)
{
	time64_t seconds = is_pq_hb ? hdev->heartbeat_debug_info.last_pq_heartbeat_ts
					: hdev->heartbeat_debug_info.last_eq_heartbeat_ts;
	struct tm tm;

	if (!seconds)
		return;

	time64_to_tm(seconds, 0, &tm);

	/*
	 * tm_year counts years since 1900 and tm_mon counts months since
	 * January (0-11), so both need an offset to print a calendar date.
	 */
	snprintf(time_str, size, "%ld-%02d-%02d %02d:%02d:%02d (UTC)",
		tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec);
}
static bool hl_device_eq_heartbeat_received(struct hl_device *hdev)
{
struct eq_heartbeat_debug_info *heartbeat_debug_info = &hdev->heartbeat_debug_info;
u32 cpu_q_id = heartbeat_debug_info->cpu_queue_id, pq_pi_mask = (HL_QUEUE_LENGTH << 1) - 1;
struct asic_fixed_properties *prop = &hdev->asic_prop;
char pq_time_str[64] = "N/A", eq_time_str[64] = "N/A";
if (!prop->cpucp_info.eq_health_check_supported)
return true;
@ -1074,13 +1091,17 @@ static bool hl_device_eq_heartbeat_received(struct hl_device *hdev)
if (!hdev->eq_heartbeat_received) {
dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
stringify_time_of_last_heartbeat(hdev, pq_time_str, sizeof(pq_time_str), true);
stringify_time_of_last_heartbeat(hdev, eq_time_str, sizeof(eq_time_str), false);
dev_err(hdev->dev,
"Heartbeat events counter: %u, EQ CI: %u, PQ PI: %u, PQ CI: %u (%u)\n",
heartbeat_debug_info->heartbeat_event_counter,
"EQ: {CI %u, HB counter %u, last HB time: %s}, PQ: {PI: %u, CI: %u (%u), last HB time: %s}\n",
hdev->event_queue.ci,
heartbeat_debug_info->heartbeat_event_counter,
eq_time_str,
hdev->kernel_queues[cpu_q_id].pi,
atomic_read(&hdev->kernel_queues[cpu_q_id].ci),
atomic_read(&hdev->kernel_queues[cpu_q_id].ci) & pq_pi_mask);
atomic_read(&hdev->kernel_queues[cpu_q_id].ci) & pq_pi_mask,
pq_time_str);
hl_eq_dump(hdev, &hdev->event_queue);
@ -1562,12 +1583,19 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
}
}
static void reset_heartbeat_debug_info(struct hl_device *hdev)
{
hdev->heartbeat_debug_info.last_pq_heartbeat_ts = 0;
hdev->heartbeat_debug_info.last_eq_heartbeat_ts = 0;
hdev->heartbeat_debug_info.heartbeat_event_counter = 0;
}
static inline void device_heartbeat_schedule(struct hl_device *hdev)
{
if (!hdev->heartbeat)
return;
hdev->heartbeat_debug_info.heartbeat_event_counter = 0;
reset_heartbeat_debug_info(hdev);
/*
* Before scheduling the heartbeat driver will check if eq event has received.
@ -2883,6 +2911,7 @@ void hl_set_irq_affinity(struct hl_device *hdev, int irq)
void hl_eq_heartbeat_event_handle(struct hl_device *hdev)
{
hdev->heartbeat_debug_info.heartbeat_event_counter++;
hdev->heartbeat_debug_info.last_eq_heartbeat_ts = ktime_get_real_seconds();
hdev->eq_heartbeat_received = true;
}

View File

@ -466,12 +466,12 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
} else {
struct hl_bd *bd = queue->kernel_address;
bd += hl_pi_2_offset(queue->pi);
bd += hl_pi_2_offset(pi);
dev_err(hdev->dev, "Device CPU packet timeout (status = 0x%x)\n"
"Pkt info: dma_addr: 0x%llx, kernel_addr: %p, len:0x%x, ctl: 0x%x, ptr:0x%llx, dram_bd:%u\n",
tmp, pkt_dma_addr, (void *)pkt, bd->len, bd->ctl, bd->ptr,
queue->dram_bd);
"Pkt info[%u]: dma_addr: 0x%llx, kernel_addr: %p, len:0x%x, ctl: 0x%x, ptr:0x%llx, dram_bd:%u\n",
tmp, pi, pkt_dma_addr, (void *)pkt, bd->len, bd->ctl, bd->ptr,
queue->dram_bd);
}
hdev->device_cpu_disabled = true;
goto out;
@ -681,12 +681,10 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
int rc;
memset(&hb_pkt, 0, sizeof(hb_pkt));
hb_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
hb_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST << CPUCP_PKT_CTL_OPCODE_SHIFT);
hb_pkt.value = cpu_to_le64(CPUCP_PACKET_FENCE_VAL);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt,
sizeof(hb_pkt), 0, &result);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt, sizeof(hb_pkt), 0, &result);
if ((rc) || (result != CPUCP_PACKET_FENCE_VAL))
return -EIO;
@ -697,6 +695,8 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
rc = -EIO;
}
hdev->heartbeat_debug_info.last_pq_heartbeat_ts = ktime_get_real_seconds();
return rc;
}

View File

@ -3196,10 +3196,15 @@ struct hl_reset_info {
/**
 * struct eq_heartbeat_debug_info - stores debug info to be used upon heartbeat failure.
 * @last_pq_heartbeat_ts: timestamp of the last test packet that was sent to FW.
 *                        This packet is the trigger in FW to send the EQ heartbeat event.
 *                        Taken from ktime_get_real_seconds(), i.e. wall-clock seconds;
 *                        0 means no timestamp has been recorded since the last reset.
 * @last_eq_heartbeat_ts: timestamp of the last EQ heartbeat event that was received from FW.
 *                        Same units and zero semantics as @last_pq_heartbeat_ts.
 * @heartbeat_event_counter: number of heartbeat events received.
 * @cpu_queue_id: index into the device's kernel_queues array, used to read the
 *                queue pi/ci when dumping debug info.
 */
struct eq_heartbeat_debug_info {
time64_t last_pq_heartbeat_ts;
time64_t last_eq_heartbeat_ts;
u32 heartbeat_event_counter;
u32 cpu_queue_id;
};