mirror of
https://github.com/torvalds/linux.git
synced 2024-12-26 21:02:19 +00:00
habanalabs: expand clock throttling information uAPI
In addition to the clock throttling reason, user should be able to obtain also the start time and the duration of the throttling event. Signed-off-by: Ofir Bitton <obitton@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
parent
48f3116983
commit
1679c7ee58
@ -455,6 +455,7 @@ static int device_early_init(struct hl_device *hdev)
|
||||
INIT_LIST_HEAD(&hdev->fpriv_list);
|
||||
mutex_init(&hdev->fpriv_list_lock);
|
||||
atomic_set(&hdev->in_reset, 0);
|
||||
mutex_init(&hdev->clk_throttling.lock);
|
||||
|
||||
return 0;
|
||||
|
||||
@ -495,6 +496,8 @@ static void device_early_fini(struct hl_device *hdev)
|
||||
|
||||
mutex_destroy(&hdev->fpriv_list_lock);
|
||||
|
||||
mutex_destroy(&hdev->clk_throttling.lock);
|
||||
|
||||
hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);
|
||||
|
||||
kfree(hdev->hl_chip_info);
|
||||
|
@ -2378,6 +2378,32 @@ struct multi_cs_data {
|
||||
u8 update_ts;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct hl_clk_throttle_timestamp - current/last clock throttling timestamp
|
||||
* @start: timestamp taken when 'start' event is received in driver
|
||||
* @end: timestamp taken when 'end' event is received in driver
|
||||
*/
|
||||
struct hl_clk_throttle_timestamp {
|
||||
ktime_t start;
|
||||
ktime_t end;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct hl_clk_throttle - keeps current/last clock throttling timestamps
|
||||
* @timestamp: timestamp taken by driver and firmware, index 0 refers to POWER
|
||||
* index 1 refers to THERMAL
|
||||
* @lock: protects this structure as it can be accessed from both event queue
|
||||
* context and info_ioctl context
|
||||
* @current_reason: bitmask represents the current clk throttling reasons
|
||||
* @aggregated_reason: bitmask represents aggregated clk throttling reasons since driver load
|
||||
*/
|
||||
struct hl_clk_throttle {
|
||||
struct hl_clk_throttle_timestamp timestamp[HL_CLK_THROTTLE_TYPE_MAX];
|
||||
struct mutex lock;
|
||||
u32 current_reason;
|
||||
u32 aggregated_reason;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct hl_device - habanalabs device structure.
|
||||
* @pdev: pointer to PCI device, can be NULL in case of simulator device.
|
||||
@ -2445,6 +2471,7 @@ struct multi_cs_data {
|
||||
* @pci_mem_region: array of memory regions in the PCI
|
||||
* @state_dump_specs: constants and dictionaries needed to dump system state.
|
||||
* @multi_cs_completion: array of multi-CS completion.
|
||||
* @clk_throttling: holds information about current/previous clock throttling events
|
||||
* @dram_used_mem: current DRAM memory consumption.
|
||||
* @timeout_jiffies: device CS timeout value.
|
||||
* @max_power: the max power of the device, as configured by the sysadmin. This
|
||||
@ -2474,7 +2501,6 @@ struct multi_cs_data {
|
||||
* @high_pll: high PLL profile frequency.
|
||||
* @soft_reset_cnt: number of soft reset since the driver was loaded.
|
||||
* @hard_reset_cnt: number of hard reset since the driver was loaded.
|
||||
* @clk_throttling_reason: bitmask represents the current clk throttling reasons
|
||||
* @id: device minor.
|
||||
* @id_control: minor of the control device
|
||||
* @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
|
||||
@ -2604,6 +2630,8 @@ struct hl_device {
|
||||
|
||||
struct multi_cs_completion multi_cs_completion[
|
||||
MULTI_CS_MAX_USER_CTX];
|
||||
struct hl_clk_throttle clk_throttling;
|
||||
|
||||
u32 *stream_master_qid_arr;
|
||||
atomic64_t dram_used_mem;
|
||||
u64 timeout_jiffies;
|
||||
@ -2622,7 +2650,6 @@ struct hl_device {
|
||||
u32 high_pll;
|
||||
u32 soft_reset_cnt;
|
||||
u32 hard_reset_cnt;
|
||||
u32 clk_throttling_reason;
|
||||
u16 id;
|
||||
u16 id_control;
|
||||
u16 cpu_pci_msb_addr;
|
||||
|
@ -313,15 +313,38 @@ static int pci_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
|
||||
|
||||
static int clk_throttle_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
|
||||
{
|
||||
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
|
||||
struct hl_device *hdev = hpriv->hdev;
|
||||
struct hl_info_clk_throttle clk_throttle = {0};
|
||||
ktime_t end_time, zero_time = ktime_set(0, 0);
|
||||
u32 max_size = args->return_size;
|
||||
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
|
||||
int i;
|
||||
|
||||
if ((!max_size) || (!out))
|
||||
return -EINVAL;
|
||||
|
||||
clk_throttle.clk_throttling_reason = hdev->clk_throttling_reason;
|
||||
mutex_lock(&hdev->clk_throttling.lock);
|
||||
|
||||
clk_throttle.clk_throttling_reason = hdev->clk_throttling.current_reason;
|
||||
|
||||
for (i = 0 ; i < HL_CLK_THROTTLE_TYPE_MAX ; i++) {
|
||||
if (!(hdev->clk_throttling.aggregated_reason & BIT(i)))
|
||||
continue;
|
||||
|
||||
clk_throttle.clk_throttling_timestamp_us[i] =
|
||||
ktime_to_us(hdev->clk_throttling.timestamp[i].start);
|
||||
|
||||
if (ktime_compare(hdev->clk_throttling.timestamp[i].end, zero_time))
|
||||
end_time = ktime_get();
|
||||
else
|
||||
end_time = hdev->clk_throttling.timestamp[i].end;
|
||||
|
||||
clk_throttle.clk_throttling_duration_ns[i] =
|
||||
ktime_to_ns(ktime_sub(end_time,
|
||||
hdev->clk_throttling.timestamp[i].start));
|
||||
|
||||
}
|
||||
mutex_unlock(&hdev->clk_throttling.lock);
|
||||
|
||||
return copy_to_user(out, &clk_throttle,
|
||||
min((size_t) max_size, sizeof(clk_throttle))) ? -EFAULT : 0;
|
||||
|
@ -7925,27 +7925,39 @@ static int tpc_krn_event_to_tpc_id(u16 tpc_dec_event_type)
|
||||
static void gaudi_print_clk_change_info(struct hl_device *hdev,
|
||||
u16 event_type)
|
||||
{
|
||||
ktime_t zero_time = ktime_set(0, 0);
|
||||
|
||||
mutex_lock(&hdev->clk_throttling.lock);
|
||||
|
||||
switch (event_type) {
|
||||
case GAUDI_EVENT_FIX_POWER_ENV_S:
|
||||
hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
|
||||
hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_POWER;
|
||||
hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_POWER;
|
||||
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get();
|
||||
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time;
|
||||
dev_info_ratelimited(hdev->dev,
|
||||
"Clock throttling due to power consumption\n");
|
||||
break;
|
||||
|
||||
case GAUDI_EVENT_FIX_POWER_ENV_E:
|
||||
hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
|
||||
hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_POWER;
|
||||
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get();
|
||||
dev_info_ratelimited(hdev->dev,
|
||||
"Power envelop is safe, back to optimal clock\n");
|
||||
break;
|
||||
|
||||
case GAUDI_EVENT_FIX_THERMAL_ENV_S:
|
||||
hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
|
||||
hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_THERMAL;
|
||||
hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL;
|
||||
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get();
|
||||
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time;
|
||||
dev_info_ratelimited(hdev->dev,
|
||||
"Clock throttling due to overheating\n");
|
||||
break;
|
||||
|
||||
case GAUDI_EVENT_FIX_THERMAL_ENV_E:
|
||||
hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
|
||||
hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL;
|
||||
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get();
|
||||
dev_info_ratelimited(hdev->dev,
|
||||
"Thermal envelop is safe, back to optimal clock\n");
|
||||
break;
|
||||
@ -7955,6 +7967,8 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev,
|
||||
event_type);
|
||||
break;
|
||||
}
|
||||
|
||||
mutex_unlock(&hdev->clk_throttling.lock);
|
||||
}
|
||||
|
||||
static void gaudi_handle_eqe(struct hl_device *hdev,
|
||||
|
@ -4768,24 +4768,39 @@ static int goya_unmask_irq(struct hl_device *hdev, u16 event_type)
|
||||
|
||||
static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type)
|
||||
{
|
||||
ktime_t zero_time = ktime_set(0, 0);
|
||||
|
||||
mutex_lock(&hdev->clk_throttling.lock);
|
||||
|
||||
switch (event_type) {
|
||||
case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_S:
|
||||
hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
|
||||
hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_POWER;
|
||||
hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_POWER;
|
||||
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get();
|
||||
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time;
|
||||
dev_info_ratelimited(hdev->dev,
|
||||
"Clock throttling due to power consumption\n");
|
||||
break;
|
||||
|
||||
case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_E:
|
||||
hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
|
||||
hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_POWER;
|
||||
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get();
|
||||
dev_info_ratelimited(hdev->dev,
|
||||
"Power envelop is safe, back to optimal clock\n");
|
||||
break;
|
||||
|
||||
case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_S:
|
||||
hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
|
||||
hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_THERMAL;
|
||||
hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL;
|
||||
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get();
|
||||
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time;
|
||||
dev_info_ratelimited(hdev->dev,
|
||||
"Clock throttling due to overheating\n");
|
||||
break;
|
||||
|
||||
case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E:
|
||||
hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
|
||||
hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL;
|
||||
hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get();
|
||||
dev_info_ratelimited(hdev->dev,
|
||||
"Thermal envelop is safe, back to optimal clock\n");
|
||||
break;
|
||||
@ -4795,6 +4810,8 @@ static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type)
|
||||
event_type);
|
||||
break;
|
||||
}
|
||||
|
||||
mutex_unlock(&hdev->clk_throttling.lock);
|
||||
}
|
||||
|
||||
void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
|
||||
|
@ -473,15 +473,27 @@ struct hl_info_pci_counters {
|
||||
__u64 replay_cnt;
|
||||
};
|
||||
|
||||
#define HL_CLK_THROTTLE_POWER 0x1
|
||||
#define HL_CLK_THROTTLE_THERMAL 0x2
|
||||
enum hl_clk_throttling_type {
|
||||
HL_CLK_THROTTLE_TYPE_POWER,
|
||||
HL_CLK_THROTTLE_TYPE_THERMAL,
|
||||
HL_CLK_THROTTLE_TYPE_MAX
|
||||
};
|
||||
|
||||
/* clk_throttling_reason masks */
|
||||
#define HL_CLK_THROTTLE_POWER (1 << HL_CLK_THROTTLE_TYPE_POWER)
|
||||
#define HL_CLK_THROTTLE_THERMAL (1 << HL_CLK_THROTTLE_TYPE_THERMAL)
|
||||
|
||||
/**
|
||||
* struct hl_info_clk_throttle - clock throttling reason
|
||||
* @clk_throttling_reason: each bit represents a clk throttling reason
|
||||
* @clk_throttling_timestamp_us: represents CPU timestamp in microseconds of the start-event
|
||||
* @clk_throttling_duration_ns: the clock throttle time in nanosec
|
||||
*/
|
||||
struct hl_info_clk_throttle {
|
||||
__u32 clk_throttling_reason;
|
||||
__u32 pad;
|
||||
__u64 clk_throttling_timestamp_us[HL_CLK_THROTTLE_TYPE_MAX];
|
||||
__u64 clk_throttling_duration_ns[HL_CLK_THROTTLE_TYPE_MAX];
|
||||
};
|
||||
|
||||
/**
|
||||
|
Loading…
Reference in New Issue
Block a user