mirror of
https://github.com/torvalds/linux.git
synced 2024-12-23 19:31:53 +00:00
accel/habanalabs: additional print in device-in-use info
When a device release triggers a hard reset, the cause is printed. The causes currently listed (those that increment the context refcount) are active command submissions and exported DMA buffer objects; in any other case the printout emits "unknown reason". This commit identifies and prints one more reason: allocated command buffers.

Signed-off-by: Ilia Levi <illevi@habana.ai>
Reviewed-by: Ofir Bitton <obitton@habana.ai>
Signed-off-by: Ofir Bitton <obitton@habana.ai>
This commit is contained in:
parent
c0af30697c
commit
fda92282b0
@ -549,7 +549,8 @@ int hl_hpriv_put(struct hl_fpriv *hpriv)
|
||||
return kref_put(&hpriv->refcount, hpriv_release);
|
||||
}
|
||||
|
||||
static void print_device_in_use_info(struct hl_device *hdev, const char *message)
|
||||
static void print_device_in_use_info(struct hl_device *hdev,
|
||||
struct hl_mem_mgr_fini_stats *mm_fini_stats, const char *message)
|
||||
{
|
||||
u32 active_cs_num, dmabuf_export_cnt;
|
||||
bool unknown_reason = true;
|
||||
@ -573,6 +574,12 @@ static void print_device_in_use_info(struct hl_device *hdev, const char *message
|
||||
dmabuf_export_cnt);
|
||||
}
|
||||
|
||||
if (mm_fini_stats->n_busy_cb) {
|
||||
unknown_reason = false;
|
||||
offset += scnprintf(buf + offset, size - offset, " [%u live CB handles]",
|
||||
mm_fini_stats->n_busy_cb);
|
||||
}
|
||||
|
||||
if (unknown_reason)
|
||||
scnprintf(buf + offset, size - offset, " [unknown reason]");
|
||||
|
||||
@ -590,6 +597,7 @@ void hl_device_release(struct drm_device *ddev, struct drm_file *file_priv)
|
||||
{
|
||||
struct hl_fpriv *hpriv = file_priv->driver_priv;
|
||||
struct hl_device *hdev = to_hl_device(ddev);
|
||||
struct hl_mem_mgr_fini_stats mm_fini_stats;
|
||||
|
||||
if (!hdev) {
|
||||
pr_crit("Closing FD after device was removed. Memory leak will occur and it is advised to reboot.\n");
|
||||
@ -601,12 +609,13 @@ void hl_device_release(struct drm_device *ddev, struct drm_file *file_priv)
|
||||
/* Memory buffers might be still in use at this point and thus the handles IDR destruction
|
||||
* is postponed to hpriv_release().
|
||||
*/
|
||||
hl_mem_mgr_fini(&hpriv->mem_mgr);
|
||||
hl_mem_mgr_fini(&hpriv->mem_mgr, &mm_fini_stats);
|
||||
|
||||
hdev->compute_ctx_in_release = 1;
|
||||
|
||||
if (!hl_hpriv_put(hpriv)) {
|
||||
print_device_in_use_info(hdev, "User process closed FD but device still in use");
|
||||
print_device_in_use_info(hdev, &mm_fini_stats,
|
||||
"User process closed FD but device still in use");
|
||||
hl_device_reset(hdev, HL_DRV_RESET_HARD);
|
||||
}
|
||||
|
||||
@ -976,7 +985,7 @@ static int device_early_init(struct hl_device *hdev)
|
||||
return 0;
|
||||
|
||||
free_cb_mgr:
|
||||
hl_mem_mgr_fini(&hdev->kernel_mem_mgr);
|
||||
hl_mem_mgr_fini(&hdev->kernel_mem_mgr, NULL);
|
||||
hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr);
|
||||
free_chip_info:
|
||||
kfree(hdev->hl_chip_info);
|
||||
@ -1020,7 +1029,7 @@ static void device_early_fini(struct hl_device *hdev)
|
||||
|
||||
mutex_destroy(&hdev->clk_throttling.lock);
|
||||
|
||||
hl_mem_mgr_fini(&hdev->kernel_mem_mgr);
|
||||
hl_mem_mgr_fini(&hdev->kernel_mem_mgr, NULL);
|
||||
hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr);
|
||||
|
||||
kfree(hdev->hl_chip_info);
|
||||
|
@ -904,6 +904,18 @@ struct hl_mem_mgr {
|
||||
struct idr handles;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct hl_mem_mgr_fini_stats - describes statistics returned during memory manager teardown.
|
||||
* @n_busy_cb: the amount of CB handles that could not be removed
|
||||
* @n_busy_ts: the amount of TS handles that could not be removed
|
||||
* @n_busy_other: the amount of any other type of handles that could not be removed
|
||||
*/
|
||||
struct hl_mem_mgr_fini_stats {
|
||||
u32 n_busy_cb;
|
||||
u32 n_busy_ts;
|
||||
u32 n_busy_other;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct hl_mmap_mem_buf_behavior - describes unified memory manager buffer behavior
|
||||
* @topic: string identifier used for logging
|
||||
@ -4036,7 +4048,7 @@ char *hl_format_as_binary(char *buf, size_t buf_len, u32 n);
|
||||
const char *hl_sync_engine_to_string(enum hl_sync_engine_type engine_type);
|
||||
|
||||
void hl_mem_mgr_init(struct device *dev, struct hl_mem_mgr *mmg);
|
||||
void hl_mem_mgr_fini(struct hl_mem_mgr *mmg);
|
||||
void hl_mem_mgr_fini(struct hl_mem_mgr *mmg, struct hl_mem_mgr_fini_stats *stats);
|
||||
void hl_mem_mgr_idr_destroy(struct hl_mem_mgr *mmg);
|
||||
int hl_mem_mgr_mmap(struct hl_mem_mgr *mmg, struct vm_area_struct *vma,
|
||||
void *args);
|
||||
|
@ -263,7 +263,7 @@ int hl_device_open(struct drm_device *ddev, struct drm_file *file_priv)
|
||||
|
||||
out_err:
|
||||
mutex_unlock(&hdev->fpriv_list_lock);
|
||||
hl_mem_mgr_fini(&hpriv->mem_mgr);
|
||||
hl_mem_mgr_fini(&hpriv->mem_mgr, NULL);
|
||||
hl_mem_mgr_idr_destroy(&hpriv->mem_mgr);
|
||||
hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
|
||||
mutex_destroy(&hpriv->ctx_lock);
|
||||
|
@ -318,28 +318,61 @@ void hl_mem_mgr_init(struct device *dev, struct hl_mem_mgr *mmg)
|
||||
idr_init(&mmg->handles);
|
||||
}
|
||||
|
||||
static void hl_mem_mgr_fini_stats_reset(struct hl_mem_mgr_fini_stats *stats)
|
||||
{
|
||||
if (!stats)
|
||||
return;
|
||||
|
||||
memset(stats, 0, sizeof(*stats));
|
||||
}
|
||||
|
||||
/*
 * Account for one handle that could not be removed during teardown.
 * @mem_id selects which counter of @stats is bumped; a NULL @stats means the
 * caller did not ask for statistics and the call is a no-op.
 */
static void hl_mem_mgr_fini_stats_inc(u64 mem_id, struct hl_mem_mgr_fini_stats *stats)
{
	if (!stats)
		return;

	if (mem_id == HL_MMAP_TYPE_CB) {
		stats->n_busy_cb++;
	} else if (mem_id == HL_MMAP_TYPE_TS_BUFF) {
		stats->n_busy_ts++;
	} else {
		/* only CB/TS buffers are stored today, so this is not expected */
		stats->n_busy_other++;
	}
}
|
||||
|
||||
/**
|
||||
* hl_mem_mgr_fini - release unified memory manager
|
||||
*
|
||||
* @mmg: parent unified memory manager
|
||||
* @stats: if non-NULL, it is zeroed on entry and then returns per-type counters (CB, TS,
*         other) of the handles that could not be removed, i.e. those whose put did not
*         return 1. Pass NULL when the counters are not needed.
|
||||
*
|
||||
* Release the unified memory manager. Shall be called from an interrupt context.
*
* Walks every handle in @mmg->handles, drops one reference on each buffer and logs an
* error for each handle that is still alive afterwards. The handles IDR itself is not
* destroyed here.
*
* NOTE(review): the callers visible in this change are the open/release and early
* init/fini paths, so the "interrupt context" wording above looks questionable -
* confirm the intended calling context.
|
||||
*/
|
||||
void hl_mem_mgr_fini(struct hl_mem_mgr *mmg)
|
||||
void hl_mem_mgr_fini(struct hl_mem_mgr *mmg, struct hl_mem_mgr_fini_stats *stats)
|
||||
{
|
||||
struct hl_mmap_mem_buf *buf;
|
||||
struct idr *idp;
|
||||
const char *topic;
|
||||
u64 mem_id;
|
||||
u32 id;
|
||||
|
||||
hl_mem_mgr_fini_stats_reset(stats);
|
||||
|
||||
idp = &mmg->handles;
|
||||
|
||||
idr_for_each_entry(idp, buf, id) {
|
||||
/* topic and mem_id are read before the put; presumably buf may be gone afterwards - TODO confirm */
topic = buf->behavior->topic;
|
||||
if (hl_mmap_mem_buf_put(buf) != 1)
|
||||
mem_id = buf->behavior->mem_id;
|
||||
if (hl_mmap_mem_buf_put(buf) != 1) {
|
||||
dev_err(mmg->dev,
|
||||
"%s: Buff handle %u for CTX is still alive\n",
|
||||
topic, id);
|
||||
hl_mem_mgr_fini_stats_inc(mem_id, stats);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user