habanalabs: Use pending CS amount per ASIC
Training schemes require many more concurrent command submissions than inference does. In addition, training command submissions can complete in a non-serialized manner. Hence, add support for each ASIC to configure its own amount of concurrent pending command submissions, rather than using a single predefined amount. This enhances performance by allowing the user to queue more concurrent work without waiting for previous work to complete.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
commit c16d45f42b
parent 0b168c8f1d
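As a rough illustration of the scheme this patch moves to, here is a condensed, standalone sketch -- not driver code. Only the names max_pending_cs, cs_pending and cs_sequence come from the diff below; the surrounding types and scaffolding are made up. Each context keeps a ring of pending-CS fences whose size now comes from the ASIC properties instead of the fixed HL_MAX_PENDING_CS, and a sequence number is mapped to a slot by masking with max_pending_cs - 1:

/*
 * Condensed, standalone sketch of the idea -- NOT driver code. Only
 * max_pending_cs, cs_pending and cs_sequence are taken from the patch;
 * everything else is illustrative.
 */
#include <stdio.h>
#include <stdlib.h>

struct asic_props {
	unsigned int max_pending_cs;	/* per-ASIC: 64 on Goya, 1024 on Gaudi */
};

struct ctx {
	void **cs_pending;		/* was a fixed HL_MAX_PENDING_CS array */
	unsigned long long cs_sequence;	/* ever-increasing CS sequence number */
};

/* A sequence maps to a ring slot by masking, hence the power-of-2 rule. */
static unsigned int slot_of(unsigned long long seq, const struct asic_props *prop)
{
	return (unsigned int)(seq & (prop->max_pending_cs - 1));
}

int main(void)
{
	struct asic_props gaudi = { .max_pending_cs = 1024 };
	struct ctx ctx = { 0 };

	/* the ring is now sized per ASIC instead of by a compile-time constant */
	ctx.cs_pending = calloc(gaudi.max_pending_cs, sizeof(*ctx.cs_pending));
	if (!ctx.cs_pending)
		return 1;

	ctx.cs_sequence = 70;
	printf("sequence %llu lands in slot %u of %u\n",
	       ctx.cs_sequence, slot_of(ctx.cs_sequence, &gaudi),
	       gaudi.max_pending_cs);

	free(ctx.cs_pending);
	return 0;
}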
@@ -418,7 +418,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 	spin_lock(&ctx->cs_lock);
 
 	cs_cmpl->cs_seq = ctx->cs_sequence;
-	other = ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)];
+	other = ctx->cs_pending[cs_cmpl->cs_seq &
+				(hdev->asic_prop.max_pending_cs - 1)];
 	if ((other) && (!dma_fence_is_signaled(other))) {
 		spin_unlock(&ctx->cs_lock);
 		dev_dbg(hdev->dev,
@@ -432,7 +433,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 
 	cs->sequence = cs_cmpl->cs_seq;
 
-	ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)] =
+	ctx->cs_pending[cs_cmpl->cs_seq &
+			(hdev->asic_prop.max_pending_cs - 1)] =
 							&cs_cmpl->base_fence;
 	ctx->cs_sequence++;
 
@@ -22,9 +22,11 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
 	 * to this function unless the ref count is 0
 	 */
 
-	for (i = 0 ; i < HL_MAX_PENDING_CS ; i++)
+	for (i = 0 ; i < hdev->asic_prop.max_pending_cs ; i++)
 		dma_fence_put(ctx->cs_pending[i]);
 
+	kfree(ctx->cs_pending);
+
 	if (ctx->asid != HL_KERNEL_ASID_ID) {
 		/* The engines are stopped as there is no executing CS, but the
 		 * Coresight might be still working by accessing addresses
@@ -126,6 +128,11 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
 	spin_lock_init(&ctx->cs_lock);
 	atomic_set(&ctx->thread_ctx_switch_token, 1);
 	ctx->thread_ctx_switch_wait_token = 0;
+	ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs,
+				sizeof(struct dma_fence *),
+				GFP_KERNEL);
+	if (!ctx->cs_pending)
+		return -ENOMEM;
 
 	if (is_kernel_ctx) {
 		ctx->asid = HL_KERNEL_ASID_ID; /* Kernel driver gets ASID 0 */
@@ -170,6 +177,7 @@ int hl_ctx_put(struct hl_ctx *ctx)
 
 struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
 {
+	struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop;
 	struct dma_fence *fence;
 
 	spin_lock(&ctx->cs_lock);
@@ -179,13 +187,13 @@ struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (seq + HL_MAX_PENDING_CS < ctx->cs_sequence) {
+	if (seq + asic_prop->max_pending_cs < ctx->cs_sequence) {
 		spin_unlock(&ctx->cs_lock);
 		return NULL;
 	}
 
 	fence = dma_fence_get(
-			ctx->cs_pending[seq & (HL_MAX_PENDING_CS - 1)]);
+			ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)]);
 	spin_unlock(&ctx->cs_lock);
 
 	return fence;
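One way to read the reworked window check in hl_ctx_get_fence (my interpretation, not stated in the commit message): since cs_pending holds only the last max_pending_cs fences, a sequence that is more than max_pending_cs behind ctx->cs_sequence has had its slot reused, so its fence is gone and NULL is returned to say the CS finished long ago. A tiny standalone example with made-up numbers:

/* Worked example of the window check, made-up numbers -- not driver code. */
#include <stdio.h>

int main(void)
{
	unsigned int max_pending_cs = 64;	/* asic_prop->max_pending_cs */
	unsigned long long cs_sequence = 200;	/* next sequence to be assigned */
	unsigned long long seq = 100;		/* sequence being looked up */

	if (seq + max_pending_cs < cs_sequence) {
		/* 100 + 64 = 164 < 200: slot (100 & 63) has already been reused
		 * by a newer CS, so no fence exists for seq any more and the
		 * lookup reports NULL (that CS completed long ago). */
		printf("seq %llu fell out of the %u-deep pending window\n",
		       seq, max_pending_cs);
	}
	return 0;
}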
@@ -429,6 +429,8 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
 	strncpy(prop->armcp_info.card_name, GAUDI_DEFAULT_CARD_NAME,
 					CARD_NAME_MAX_LEN);
 
+	prop->max_pending_cs = GAUDI_MAX_PENDING_CS;
+
 	return 0;
 }
 
@@ -57,6 +57,12 @@
 
 #define GAUDI_DEFAULT_CARD_NAME		"HL2000"
 
+#define GAUDI_MAX_PENDING_CS		1024
+
+#if !IS_MAX_PENDING_CS_VALID(GAUDI_MAX_PENDING_CS)
+#error "GAUDI_MAX_PENDING_CS must be power of 2 and greater than 1"
+#endif
+
 #define PCI_DMA_NUMBER_OF_CHNLS		3
 #define HBM_DMA_NUMBER_OF_CHNLS		5
 #define DMA_NUMBER_OF_CHNLS		(PCI_DMA_NUMBER_OF_CHNLS + \
@@ -426,6 +426,8 @@ void goya_get_fixed_properties(struct hl_device *hdev)
 
 	strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME,
 					CARD_NAME_MAX_LEN);
+
+	prop->max_pending_cs = GOYA_MAX_PENDING_CS;
 }
 
 /*
@@ -57,6 +57,12 @@
 
 #define GOYA_DEFAULT_CARD_NAME		"HL1000"
 
+#define GOYA_MAX_PENDING_CS		64
+
+#if !IS_MAX_PENDING_CS_VALID(GOYA_MAX_PENDING_CS)
+#error "GOYA_MAX_PENDING_CS must be power of 2 and greater than 1"
+#endif
+
 /* DRAM Memory Map */
 
 #define CPU_FW_IMAGE_SIZE	0x10000000	/* 256MB */
@@ -42,9 +42,6 @@
 
 #define HL_MAX_QUEUES		128
 
-/* MUST BE POWER OF 2 and larger than 1 */
-#define HL_MAX_PENDING_CS	64
-
 #define HL_IDLE_BUSY_TS_ARR_SIZE	4096
 
 /* Memory */
@@ -61,6 +58,9 @@
 
 #define HL_MAX_SOB_VAL		(1 << 15)
 
+#define IS_POWER_OF_2(n)	(n != 0 && ((n & (n - 1)) == 0))
+#define IS_MAX_PENDING_CS_VALID(n)	(IS_POWER_OF_2(n) && (n > 1))
+
 /**
  * struct pgt_info - MMU hop page info.
  * @node: hash linked-list node for the pgts shadow hash of pgts.
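The two helpers added here back the compile-time checks in the Gaudi and Goya headers: the driver indexes cs_pending with seq & (max_pending_cs - 1), which equals seq % max_pending_cs only when the size is a power of two, so a non-power-of-two GAUDI_MAX_PENDING_CS or GOYA_MAX_PENDING_CS must fail the build. A standalone sanity check of that assumption (user-space sketch, not driver code; only the two macros are copied from the hunk above):

/* Standalone sanity check of the masking assumption -- not driver code. */
#include <assert.h>
#include <stdio.h>

#define IS_POWER_OF_2(n)		(n != 0 && ((n & (n - 1)) == 0))
#define IS_MAX_PENDING_CS_VALID(n)	(IS_POWER_OF_2(n) && (n > 1))

int main(void)
{
	unsigned long long seq;

	assert(IS_MAX_PENDING_CS_VALID(64));	/* GOYA_MAX_PENDING_CS */
	assert(IS_MAX_PENDING_CS_VALID(1024));	/* GAUDI_MAX_PENDING_CS */
	assert(!IS_MAX_PENDING_CS_VALID(1));	/* power of 2 but too small */
	assert(!IS_MAX_PENDING_CS_VALID(100));	/* not a power of 2 */

	/* seq & (n - 1) only equals seq % n when n is a power of two */
	for (seq = 0; seq < 100000; seq++)
		assert((seq & (64 - 1)) == (seq % 64));

	printf("mask and modulo agree for power-of-two ring sizes\n");
	return 0;
}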
@@ -285,6 +285,7 @@ struct asic_fixed_properties {
 	u32				high_pll;
 	u32				cb_pool_cb_cnt;
 	u32				cb_pool_cb_size;
+	u32				max_pending_cs;
 	u8				tpc_enabled_mask;
 	u8				completion_queues_count;
 };
@@ -782,7 +783,7 @@ struct hl_ctx {
 	struct hl_fpriv		*hpriv;
 	struct hl_device	*hdev;
 	struct kref		refcount;
-	struct dma_fence	*cs_pending[HL_MAX_PENDING_CS];
+	struct dma_fence	**cs_pending;
 	struct hl_va_range	*host_va_range;
 	struct hl_va_range	*host_huge_va_range;
 	struct hl_va_range	*dram_va_range;
@@ -376,7 +376,7 @@ static void hw_queue_schedule_job(struct hl_cs_job *job)
 	 * write address offset in the SM block (QMAN LBW message).
 	 * The write address offset is calculated as "COMP_OFFSET << 2".
 	 */
-	offset = job->cs->sequence & (HL_MAX_PENDING_CS - 1);
+	offset = job->cs->sequence & (hdev->asic_prop.max_pending_cs - 1);
 	ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) |
 		((q->pi << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK);
 