habanalabs: Use pending CS amount per ASIC
Training schemes require many more concurrent command submissions than inference does. In addition, training command submissions can complete in a non-serialized manner. Hence, add support for each ASIC to configure its own amount of concurrent pending command submissions, rather than using a single predefined amount. This enhances performance by allowing the user to queue more concurrent work without waiting for previous work to complete.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
commit c16d45f42b
parent 0b168c8f1d
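As a rough illustration of the scheme this patch moves to, here is a condensed, standalone sketch -- not driver code. Only the names max_pending_cs, cs_pending and cs_sequence come from the diff below; the surrounding types and scaffolding are made up. Each context keeps a ring of pending-CS fences whose size now comes from the ASIC properties instead of the fixed HL_MAX_PENDING_CS, and a sequence number is mapped to a slot by masking with max_pending_cs - 1:

/*
 * Condensed, standalone sketch of the idea -- NOT driver code. Only
 * max_pending_cs, cs_pending and cs_sequence are taken from the patch;
 * everything else is illustrative.
 */
#include <stdio.h>
#include <stdlib.h>

struct asic_props {
	unsigned int max_pending_cs;	/* per-ASIC: 64 on Goya, 1024 on Gaudi */
};

struct ctx {
	void **cs_pending;		/* was a fixed HL_MAX_PENDING_CS array */
	unsigned long long cs_sequence;	/* ever-increasing CS sequence number */
};

/* A sequence maps to a ring slot by masking, hence the power-of-2 rule. */
static unsigned int slot_of(unsigned long long seq, const struct asic_props *prop)
{
	return (unsigned int)(seq & (prop->max_pending_cs - 1));
}

int main(void)
{
	struct asic_props gaudi = { .max_pending_cs = 1024 };
	struct ctx ctx = { 0 };

	/* the ring is now sized per ASIC instead of by a compile-time constant */
	ctx.cs_pending = calloc(gaudi.max_pending_cs, sizeof(*ctx.cs_pending));
	if (!ctx.cs_pending)
		return 1;

	ctx.cs_sequence = 70;
	printf("sequence %llu lands in slot %u of %u\n",
	       ctx.cs_sequence, slot_of(ctx.cs_sequence, &gaudi),
	       gaudi.max_pending_cs);

	free(ctx.cs_pending);
	return 0;
}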
@@ -418,7 +418,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 	spin_lock(&ctx->cs_lock);
 
 	cs_cmpl->cs_seq = ctx->cs_sequence;
-	other = ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)];
+	other = ctx->cs_pending[cs_cmpl->cs_seq &
+				(hdev->asic_prop.max_pending_cs - 1)];
 	if ((other) && (!dma_fence_is_signaled(other))) {
 		spin_unlock(&ctx->cs_lock);
 		dev_dbg(hdev->dev,
@@ -432,7 +433,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 
 	cs->sequence = cs_cmpl->cs_seq;
 
-	ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)] =
+	ctx->cs_pending[cs_cmpl->cs_seq &
+			(hdev->asic_prop.max_pending_cs - 1)] =
 							&cs_cmpl->base_fence;
 	ctx->cs_sequence++;
 
@@ -22,9 +22,11 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
 	 * to this function unless the ref count is 0
 	 */
 
-	for (i = 0 ; i < HL_MAX_PENDING_CS ; i++)
+	for (i = 0 ; i < hdev->asic_prop.max_pending_cs ; i++)
 		dma_fence_put(ctx->cs_pending[i]);
 
+	kfree(ctx->cs_pending);
+
 	if (ctx->asid != HL_KERNEL_ASID_ID) {
 		/* The engines are stopped as there is no executing CS, but the
 		 * Coresight might be still working by accessing addresses
@@ -126,6 +128,11 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
 	spin_lock_init(&ctx->cs_lock);
 	atomic_set(&ctx->thread_ctx_switch_token, 1);
 	ctx->thread_ctx_switch_wait_token = 0;
+	ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs,
+				sizeof(struct dma_fence *),
+				GFP_KERNEL);
+	if (!ctx->cs_pending)
+		return -ENOMEM;
 
 	if (is_kernel_ctx) {
 		ctx->asid = HL_KERNEL_ASID_ID; /* Kernel driver gets ASID 0 */
@@ -170,6 +177,7 @@ int hl_ctx_put(struct hl_ctx *ctx)
 
 struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
 {
+	struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop;
 	struct dma_fence *fence;
 
 	spin_lock(&ctx->cs_lock);
@@ -179,13 +187,13 @@ struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (seq + HL_MAX_PENDING_CS < ctx->cs_sequence) {
+	if (seq + asic_prop->max_pending_cs < ctx->cs_sequence) {
 		spin_unlock(&ctx->cs_lock);
 		return NULL;
 	}
 
 	fence = dma_fence_get(
-			ctx->cs_pending[seq & (HL_MAX_PENDING_CS - 1)]);
+			ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)]);
 	spin_unlock(&ctx->cs_lock);
 
 	return fence;
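One way to read the reworked window check in hl_ctx_get_fence (my interpretation, not stated in the commit message): since cs_pending holds only the last max_pending_cs fences, a sequence that is more than max_pending_cs behind ctx->cs_sequence has had its slot reused, so its fence is gone and NULL is returned to say the CS finished long ago. A tiny standalone example with made-up numbers:

/* Worked example of the window check, made-up numbers -- not driver code. */
#include <stdio.h>

int main(void)
{
	unsigned int max_pending_cs = 64;	/* asic_prop->max_pending_cs */
	unsigned long long cs_sequence = 200;	/* next sequence to be assigned */
	unsigned long long seq = 100;		/* sequence being looked up */

	if (seq + max_pending_cs < cs_sequence) {
		/* 100 + 64 = 164 < 200: slot (100 & 63) has already been reused
		 * by a newer CS, so no fence exists for seq any more and the
		 * lookup reports NULL (that CS completed long ago). */
		printf("seq %llu fell out of the %u-deep pending window\n",
		       seq, max_pending_cs);
	}
	return 0;
}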
@@ -429,6 +429,8 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
 	strncpy(prop->armcp_info.card_name, GAUDI_DEFAULT_CARD_NAME,
 					CARD_NAME_MAX_LEN);
 
+	prop->max_pending_cs = GAUDI_MAX_PENDING_CS;
+
 	return 0;
 }
 
@@ -57,6 +57,12 @@
 
 #define GAUDI_DEFAULT_CARD_NAME		"HL2000"
 
+#define GAUDI_MAX_PENDING_CS		1024
+
+#if !IS_MAX_PENDING_CS_VALID(GAUDI_MAX_PENDING_CS)
+#error "GAUDI_MAX_PENDING_CS must be power of 2 and greater than 1"
+#endif
+
 #define PCI_DMA_NUMBER_OF_CHNLS		3
 #define HBM_DMA_NUMBER_OF_CHNLS		5
 #define DMA_NUMBER_OF_CHNLS		(PCI_DMA_NUMBER_OF_CHNLS + \
@@ -426,6 +426,8 @@ void goya_get_fixed_properties(struct hl_device *hdev)
 
 	strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME,
 					CARD_NAME_MAX_LEN);
+
+	prop->max_pending_cs = GOYA_MAX_PENDING_CS;
 }
 
 /*
@@ -57,6 +57,12 @@
 
 #define GOYA_DEFAULT_CARD_NAME		"HL1000"
 
+#define GOYA_MAX_PENDING_CS		64
+
+#if !IS_MAX_PENDING_CS_VALID(GOYA_MAX_PENDING_CS)
+#error "GOYA_MAX_PENDING_CS must be power of 2 and greater than 1"
+#endif
+
 /* DRAM Memory Map */
 
 #define CPU_FW_IMAGE_SIZE	0x10000000	/* 256MB */
@@ -42,9 +42,6 @@
 
 #define HL_MAX_QUEUES		128
 
-/* MUST BE POWER OF 2 and larger than 1 */
-#define HL_MAX_PENDING_CS	64
-
 #define HL_IDLE_BUSY_TS_ARR_SIZE	4096
 
 /* Memory */
@@ -61,6 +58,9 @@
 
 #define HL_MAX_SOB_VAL		(1 << 15)
 
+#define IS_POWER_OF_2(n)	(n != 0 && ((n & (n - 1)) == 0))
+#define IS_MAX_PENDING_CS_VALID(n)	(IS_POWER_OF_2(n) && (n > 1))
+
 /**
  * struct pgt_info - MMU hop page info.
  * @node: hash linked-list node for the pgts shadow hash of pgts.
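The two helpers added here back the compile-time checks in the Gaudi and Goya headers: the driver indexes cs_pending with seq & (max_pending_cs - 1), which equals seq % max_pending_cs only when the size is a power of two, so a non-power-of-two GAUDI_MAX_PENDING_CS or GOYA_MAX_PENDING_CS must fail the build. A standalone sanity check of that assumption (user-space sketch, not driver code; only the two macros are copied from the hunk above):

/* Standalone sanity check of the masking assumption -- not driver code. */
#include <assert.h>
#include <stdio.h>

#define IS_POWER_OF_2(n)		(n != 0 && ((n & (n - 1)) == 0))
#define IS_MAX_PENDING_CS_VALID(n)	(IS_POWER_OF_2(n) && (n > 1))

int main(void)
{
	unsigned long long seq;

	assert(IS_MAX_PENDING_CS_VALID(64));	/* GOYA_MAX_PENDING_CS */
	assert(IS_MAX_PENDING_CS_VALID(1024));	/* GAUDI_MAX_PENDING_CS */
	assert(!IS_MAX_PENDING_CS_VALID(1));	/* power of 2 but too small */
	assert(!IS_MAX_PENDING_CS_VALID(100));	/* not a power of 2 */

	/* seq & (n - 1) only equals seq % n when n is a power of two */
	for (seq = 0; seq < 100000; seq++)
		assert((seq & (64 - 1)) == (seq % 64));

	printf("mask and modulo agree for power-of-two ring sizes\n");
	return 0;
}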
@@ -285,6 +285,7 @@ struct asic_fixed_properties {
 	u32				high_pll;
 	u32				cb_pool_cb_cnt;
 	u32				cb_pool_cb_size;
+	u32				max_pending_cs;
 	u8				tpc_enabled_mask;
 	u8				completion_queues_count;
 };
@@ -782,7 +783,7 @@ struct hl_ctx {
 	struct hl_fpriv		*hpriv;
 	struct hl_device	*hdev;
 	struct kref		refcount;
-	struct dma_fence	*cs_pending[HL_MAX_PENDING_CS];
+	struct dma_fence	**cs_pending;
 	struct hl_va_range	*host_va_range;
 	struct hl_va_range	*host_huge_va_range;
 	struct hl_va_range	*dram_va_range;
@@ -376,7 +376,7 @@ static void hw_queue_schedule_job(struct hl_cs_job *job)
 	 * write address offset in the SM block (QMAN LBW message).
 	 * The write address offset is calculated as "COMP_OFFSET << 2".
 	 */
-	offset = job->cs->sequence & (HL_MAX_PENDING_CS - 1);
+	offset = job->cs->sequence & (hdev->asic_prop.max_pending_cs - 1);
 	ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) |
 		((q->pi << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK);
 