mirror of
https://github.com/torvalds/linux.git
synced 2025-01-01 15:51:46 +00:00
habanalabs: don't notify user about clk throttling due to power
Clock throttling due to high power consumption can happen very frequently, and there is no real reason to notify the user about it, so we skip this notification on all ASICs.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
parent
ce259804d2
commit
6710444cfe
@ -7584,7 +7584,7 @@ static int tpc_krn_event_to_tpc_id(u16 tpc_dec_event_type)
|
|||||||
return (tpc_dec_event_type - GAUDI_EVENT_TPC0_KRN_ERR) / 6;
|
return (tpc_dec_event_type - GAUDI_EVENT_TPC0_KRN_ERR) / 6;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void gaudi_print_clk_change_info(struct hl_device *hdev, u16 event_type)
|
static void gaudi_print_clk_change_info(struct hl_device *hdev, u16 event_type, u64 *event_mask)
|
||||||
{
|
{
|
||||||
ktime_t zero_time = ktime_set(0, 0);
|
ktime_t zero_time = ktime_set(0, 0);
|
||||||
|
|
||||||
@ -7612,6 +7612,7 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev, u16 event_type)
|
|||||||
hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL;
|
hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL;
|
||||||
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get();
|
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get();
|
||||||
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time;
|
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time;
|
||||||
|
*event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
|
||||||
dev_info_ratelimited(hdev->dev,
|
dev_info_ratelimited(hdev->dev,
|
||||||
"Clock throttling due to overheating\n");
|
"Clock throttling due to overheating\n");
|
||||||
break;
|
break;
|
||||||
@ -7619,6 +7620,7 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev, u16 event_type)
|
|||||||
case GAUDI_EVENT_FIX_THERMAL_ENV_E:
|
case GAUDI_EVENT_FIX_THERMAL_ENV_E:
|
||||||
hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL;
|
hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL;
|
||||||
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get();
|
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get();
|
||||||
|
*event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
|
||||||
dev_info_ratelimited(hdev->dev,
|
dev_info_ratelimited(hdev->dev,
|
||||||
"Thermal envelop is safe, back to optimal clock\n");
|
"Thermal envelop is safe, back to optimal clock\n");
|
||||||
break;
|
break;
|
||||||
@ -7887,8 +7889,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case GAUDI_EVENT_FIX_POWER_ENV_S ... GAUDI_EVENT_FIX_THERMAL_ENV_E:
|
case GAUDI_EVENT_FIX_POWER_ENV_S ... GAUDI_EVENT_FIX_THERMAL_ENV_E:
|
||||||
event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
|
gaudi_print_clk_change_info(hdev, event_type, &event_mask);
|
||||||
gaudi_print_clk_change_info(hdev, event_type);
|
|
||||||
hl_fw_unmask_irq(hdev, event_type);
|
hl_fw_unmask_irq(hdev, event_type);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -8603,7 +8603,7 @@ static void gaudi2_handle_hbm_mc_spi(struct hl_device *hdev, u64 intr_cause_data
|
|||||||
hbm_mc_spi[i].cause);
|
hbm_mc_spi[i].cause);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void gaudi2_print_clk_change_info(struct hl_device *hdev, u16 event_type)
|
static void gaudi2_print_clk_change_info(struct hl_device *hdev, u16 event_type, u64 *event_mask)
|
||||||
{
|
{
|
||||||
ktime_t zero_time = ktime_set(0, 0);
|
ktime_t zero_time = ktime_set(0, 0);
|
||||||
|
|
||||||
@ -8629,12 +8629,14 @@ static void gaudi2_print_clk_change_info(struct hl_device *hdev, u16 event_type)
|
|||||||
hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL;
|
hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL;
|
||||||
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get();
|
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get();
|
||||||
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time;
|
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time;
|
||||||
|
*event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
|
||||||
dev_info_ratelimited(hdev->dev, "Clock throttling due to overheating\n");
|
dev_info_ratelimited(hdev->dev, "Clock throttling due to overheating\n");
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case GAUDI2_EVENT_CPU_FIX_THERMAL_ENV_E:
|
case GAUDI2_EVENT_CPU_FIX_THERMAL_ENV_E:
|
||||||
hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL;
|
hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL;
|
||||||
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get();
|
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get();
|
||||||
|
*event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
|
||||||
dev_info_ratelimited(hdev->dev, "Thermal envelop is safe, back to optimal clock\n");
|
dev_info_ratelimited(hdev->dev, "Thermal envelop is safe, back to optimal clock\n");
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@ -9085,8 +9087,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
|
|||||||
case GAUDI2_EVENT_CPU_FIX_POWER_ENV_E:
|
case GAUDI2_EVENT_CPU_FIX_POWER_ENV_E:
|
||||||
case GAUDI2_EVENT_CPU_FIX_THERMAL_ENV_S:
|
case GAUDI2_EVENT_CPU_FIX_THERMAL_ENV_S:
|
||||||
case GAUDI2_EVENT_CPU_FIX_THERMAL_ENV_E:
|
case GAUDI2_EVENT_CPU_FIX_THERMAL_ENV_E:
|
||||||
gaudi2_print_clk_change_info(hdev, event_type);
|
gaudi2_print_clk_change_info(hdev, event_type, &event_mask);
|
||||||
event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case GAUDI2_EVENT_CPU_PKT_QUEUE_OUT_SYNC:
|
case GAUDI2_EVENT_CPU_PKT_QUEUE_OUT_SYNC:
|
||||||
|
Loading…
Reference in New Issue
Block a user