From 0b168c8f1d21f87003fb28b4c87c32335d7fc94b Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 15 Jun 2020 19:25:57 +0300 Subject: [PATCH 01/28] habanalabs: remove rate limiters from GAUDI We no longer need to initialize the rate limiters in GAUDI A1. Reviewed-by: Omer Shpigelman Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 126 +---------------------- drivers/misc/habanalabs/habanalabs_drv.c | 1 - 2 files changed, 1 insertion(+), 126 deletions(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 7dd99ec41627..3ee104b7cb8c 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -1638,8 +1638,8 @@ static void gaudi_init_hbm_cred(struct hl_device *hdev) uint32_t hbm0_wr, hbm1_wr, hbm0_rd, hbm1_rd; hbm0_wr = 0x33333333; - hbm1_wr = 0x33333333; hbm0_rd = 0x77777777; + hbm1_wr = 0x55555555; hbm1_rd = 0xDDDDDDDD; WREG32(mmDMA_IF_E_N_HBM0_WR_CRED_CNT, hbm0_wr); @@ -1689,125 +1689,6 @@ static void gaudi_init_hbm_cred(struct hl_device *hdev) (1 << DMA_IF_HBM_CRED_EN_WRITE_CREDIT_EN_SHIFT)); } -static void gaudi_init_rate_limiter(struct hl_device *hdev) -{ - u32 nr, nf, od, sat, rst, timeout; - u64 freq; - - nr = RREG32(mmPSOC_HBM_PLL_NR); - nf = RREG32(mmPSOC_HBM_PLL_NF); - od = RREG32(mmPSOC_HBM_PLL_OD); - freq = (50 * (nf + 1)) / ((nr + 1) * (od + 1)); - - dev_dbg(hdev->dev, "HBM frequency is %lluMHz\n", freq); - - /* Configuration is for five (5) DDMA channels */ - if (freq == 800) { - sat = 4; - rst = 11; - timeout = 15; - } else if (freq == 900) { - sat = 4; - rst = 15; - timeout = 16; - } else if (freq == 950) { - sat = 4; - rst = 15; - timeout = 15; - } else { - dev_warn(hdev->dev, - "unsupported HBM frequency %lluMHz, no rate-limiters\n", - freq); - return; - } - - WREG32(mmDMA_IF_W_S_DOWN_RSP_MID_WGHT_0, 0x111); - WREG32(mmDMA_IF_W_S_DOWN_RSP_MID_WGHT_1, 0x111); - WREG32(mmDMA_IF_E_S_DOWN_RSP_MID_WGHT_0, 0x111); - WREG32(mmDMA_IF_E_S_DOWN_RSP_MID_WGHT_1, 0x111); - WREG32(mmDMA_IF_W_N_DOWN_RSP_MID_WGHT_0, 0x111); - WREG32(mmDMA_IF_W_N_DOWN_RSP_MID_WGHT_1, 0x111); - WREG32(mmDMA_IF_E_N_DOWN_RSP_MID_WGHT_0, 0x111); - WREG32(mmDMA_IF_E_N_DOWN_RSP_MID_WGHT_1, 0x111); - - if (!hdev->rl_enable) { - dev_info(hdev->dev, "Rate limiters disabled\n"); - return; - } - - WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_HBM_SAT, sat); - WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_HBM_SAT, sat); - WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_HBM_SAT, sat); - WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_HBM_SAT, sat); - WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_HBM_SAT, sat); - WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_HBM_SAT, sat); - WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_HBM_SAT, sat); - WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_HBM_SAT, sat); - - WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_HBM_RST, rst); - WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_HBM_RST, rst); - WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_HBM_RST, rst); - WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_HBM_RST, rst); - WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_HBM_RST, rst); - WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_HBM_RST, rst); - WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_HBM_RST, rst); - WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_HBM_RST, rst); - - WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_HBM_TIMEOUT, timeout); - WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_HBM_TIMEOUT, timeout); - WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_HBM_TIMEOUT, timeout); - WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_HBM_TIMEOUT, timeout); - WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_HBM_TIMEOUT, timeout); - WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_HBM_TIMEOUT, timeout); - WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_HBM_TIMEOUT, timeout); - WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_HBM_TIMEOUT, timeout); - - WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_HBM_EN, 1); - WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_HBM_EN, 1); - WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_HBM_EN, 1); - WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_HBM_EN, 1); - WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_HBM_EN, 1); - WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_HBM_EN, 1); - WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_HBM_EN, 1); - WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_HBM_EN, 1); - - WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_SRAM_SAT, sat); - WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_SRAM_SAT, sat); - WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_SRAM_SAT, sat); - WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_SRAM_SAT, sat); - WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_SRAM_SAT, sat); - WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_SRAM_SAT, sat); - WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_SRAM_SAT, sat); - WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_SRAM_SAT, sat); - - WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_SRAM_RST, rst); - WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_SRAM_RST, rst); - WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_SRAM_RST, rst); - WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_SRAM_RST, rst); - WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_SRAM_RST, rst); - WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_SRAM_RST, rst); - WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_SRAM_RST, rst); - WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_SRAM_RST, rst); - - WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_SRAM_TIMEOUT, timeout); - WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_SRAM_TIMEOUT, timeout); - WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_SRAM_TIMEOUT, timeout); - WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_SRAM_TIMEOUT, timeout); - WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_SRAM_TIMEOUT, timeout); - WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_SRAM_TIMEOUT, timeout); - WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_SRAM_TIMEOUT, timeout); - WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_SRAM_TIMEOUT, timeout); - - WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_SRAM_EN, 1); - WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_SRAM_EN, 1); - WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_SRAM_EN, 1); - WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_SRAM_EN, 1); - WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_SRAM_EN, 1); - WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_SRAM_EN, 1); - WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_SRAM_EN, 1); - WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_SRAM_EN, 1); -} - static void gaudi_init_golden_registers(struct hl_device *hdev) { u32 tpc_offset; @@ -1817,8 +1698,6 @@ static void gaudi_init_golden_registers(struct hl_device *hdev) gaudi_init_hbm_cred(hdev); - gaudi_init_rate_limiter(hdev); - gaudi_disable_clock_gating(hdev); for (tpc_id = 0, tpc_offset = 0; @@ -1839,9 +1718,6 @@ static void gaudi_init_golden_registers(struct hl_device *hdev) WREG32(mmMME1_CTRL_EUS_ROLLUP_CNT_ADD, 3); WREG32(mmMME2_CTRL_EUS_ROLLUP_CNT_ADD, 3); WREG32(mmMME3_CTRL_EUS_ROLLUP_CNT_ADD, 3); - - /* WA for H3-2081 */ - WREG32(mmPCIE_WRAP_MAX_OUTSTAND, 0x10ff); } static void gaudi_init_pci_dma_qman(struct hl_device *hdev, int dma_id, diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/habanalabs_drv.c index 8652c7e5d7f1..f38664b03865 100644 --- a/drivers/misc/habanalabs/habanalabs_drv.c +++ b/drivers/misc/habanalabs/habanalabs_drv.c @@ -238,7 +238,6 @@ static void set_driver_behavior_per_device(struct hl_device *hdev) hdev->axi_drain = 0; hdev->sram_scrambler_enable = 1; hdev->dram_scrambler_enable = 1; - hdev->rl_enable = 1; hdev->bmc_enable = 1; hdev->hard_reset_on_fw_events = 1; } From c16d45f42b64e91895f4bc1cf19febeb5e0c52b6 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Tue, 2 Jun 2020 12:28:27 +0300 Subject: [PATCH 02/28] habanalabs: Use pending CS amount per ASIC Training schemes requires much more concurrent command submissions than inference does. In addition, training command submissions can be completed in a non serialized manner. Hence, we add support in which each ASIC will be able to configure the amount of concurrent pending command submissions, rather than use a predefined amount. This change will enhance performance by allowing the user to add more concurrent work without waiting for the previous work to be completed. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/command_submission.c | 6 ++++-- drivers/misc/habanalabs/context.c | 14 +++++++++++--- drivers/misc/habanalabs/gaudi/gaudi.c | 2 ++ drivers/misc/habanalabs/gaudi/gaudiP.h | 6 ++++++ drivers/misc/habanalabs/goya/goya.c | 2 ++ drivers/misc/habanalabs/goya/goyaP.h | 6 ++++++ drivers/misc/habanalabs/habanalabs.h | 9 +++++---- drivers/misc/habanalabs/hw_queue.c | 2 +- 8 files changed, 37 insertions(+), 10 deletions(-) diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c index b0f62cbbdc87..e99c1d126bd3 100644 --- a/drivers/misc/habanalabs/command_submission.c +++ b/drivers/misc/habanalabs/command_submission.c @@ -418,7 +418,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, spin_lock(&ctx->cs_lock); cs_cmpl->cs_seq = ctx->cs_sequence; - other = ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)]; + other = ctx->cs_pending[cs_cmpl->cs_seq & + (hdev->asic_prop.max_pending_cs - 1)]; if ((other) && (!dma_fence_is_signaled(other))) { spin_unlock(&ctx->cs_lock); dev_dbg(hdev->dev, @@ -432,7 +433,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, cs->sequence = cs_cmpl->cs_seq; - ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)] = + ctx->cs_pending[cs_cmpl->cs_seq & + (hdev->asic_prop.max_pending_cs - 1)] = &cs_cmpl->base_fence; ctx->cs_sequence++; diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/context.c index ec92b3506b1f..1b96fefa4a65 100644 --- a/drivers/misc/habanalabs/context.c +++ b/drivers/misc/habanalabs/context.c @@ -22,9 +22,11 @@ static void hl_ctx_fini(struct hl_ctx *ctx) * to this function unless the ref count is 0 */ - for (i = 0 ; i < HL_MAX_PENDING_CS ; i++) + for (i = 0 ; i < hdev->asic_prop.max_pending_cs ; i++) dma_fence_put(ctx->cs_pending[i]); + kfree(ctx->cs_pending); + if (ctx->asid != HL_KERNEL_ASID_ID) { /* The engines are stopped as there is no executing CS, but the * Coresight might be still working by accessing addresses @@ -126,6 +128,11 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) spin_lock_init(&ctx->cs_lock); atomic_set(&ctx->thread_ctx_switch_token, 1); ctx->thread_ctx_switch_wait_token = 0; + ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs, + sizeof(struct dma_fence *), + GFP_KERNEL); + if (!ctx->cs_pending) + return -ENOMEM; if (is_kernel_ctx) { ctx->asid = HL_KERNEL_ASID_ID; /* Kernel driver gets ASID 0 */ @@ -170,6 +177,7 @@ int hl_ctx_put(struct hl_ctx *ctx) struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq) { + struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop; struct dma_fence *fence; spin_lock(&ctx->cs_lock); @@ -179,13 +187,13 @@ struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq) return ERR_PTR(-EINVAL); } - if (seq + HL_MAX_PENDING_CS < ctx->cs_sequence) { + if (seq + asic_prop->max_pending_cs < ctx->cs_sequence) { spin_unlock(&ctx->cs_lock); return NULL; } fence = dma_fence_get( - ctx->cs_pending[seq & (HL_MAX_PENDING_CS - 1)]); + ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)]); spin_unlock(&ctx->cs_lock); return fence; diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 3ee104b7cb8c..9d9cbcd5a28a 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -429,6 +429,8 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev) strncpy(prop->armcp_info.card_name, GAUDI_DEFAULT_CARD_NAME, CARD_NAME_MAX_LEN); + prop->max_pending_cs = GAUDI_MAX_PENDING_CS; + return 0; } diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h index 41a8d9bff6bf..63baef1e4e99 100644 --- a/drivers/misc/habanalabs/gaudi/gaudiP.h +++ b/drivers/misc/habanalabs/gaudi/gaudiP.h @@ -57,6 +57,12 @@ #define GAUDI_DEFAULT_CARD_NAME "HL2000" +#define GAUDI_MAX_PENDING_CS 1024 + +#if !IS_MAX_PENDING_CS_VALID(GAUDI_MAX_PENDING_CS) +#error "GAUDI_MAX_PENDING_CS must be power of 2 and greater than 1" +#endif + #define PCI_DMA_NUMBER_OF_CHNLS 3 #define HBM_DMA_NUMBER_OF_CHNLS 5 #define DMA_NUMBER_OF_CHNLS (PCI_DMA_NUMBER_OF_CHNLS + \ diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index a4a20e27ed3b..6dccaec95ffb 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -426,6 +426,8 @@ void goya_get_fixed_properties(struct hl_device *hdev) strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME, CARD_NAME_MAX_LEN); + + prop->max_pending_cs = GOYA_MAX_PENDING_CS; } /* diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h index d36f8d90c9c9..9d8a1761252d 100644 --- a/drivers/misc/habanalabs/goya/goyaP.h +++ b/drivers/misc/habanalabs/goya/goyaP.h @@ -57,6 +57,12 @@ #define GOYA_DEFAULT_CARD_NAME "HL1000" +#define GOYA_MAX_PENDING_CS 64 + +#if !IS_MAX_PENDING_CS_VALID(GOYA_MAX_PENDING_CS) +#error "GOYA_MAX_PENDING_CS must be power of 2 and greater than 1" +#endif + /* DRAM Memory Map */ #define CPU_FW_IMAGE_SIZE 0x10000000 /* 256MB */ diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 1ecdcf8b763a..64d9b2dd3e19 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -42,9 +42,6 @@ #define HL_MAX_QUEUES 128 -/* MUST BE POWER OF 2 and larger than 1 */ -#define HL_MAX_PENDING_CS 64 - #define HL_IDLE_BUSY_TS_ARR_SIZE 4096 /* Memory */ @@ -61,6 +58,9 @@ #define HL_MAX_SOB_VAL (1 << 15) +#define IS_POWER_OF_2(n) (n != 0 && ((n & (n - 1)) == 0)) +#define IS_MAX_PENDING_CS_VALID(n) (IS_POWER_OF_2(n) && (n > 1)) + /** * struct pgt_info - MMU hop page info. * @node: hash linked-list node for the pgts shadow hash of pgts. @@ -285,6 +285,7 @@ struct asic_fixed_properties { u32 high_pll; u32 cb_pool_cb_cnt; u32 cb_pool_cb_size; + u32 max_pending_cs; u8 tpc_enabled_mask; u8 completion_queues_count; }; @@ -782,7 +783,7 @@ struct hl_ctx { struct hl_fpriv *hpriv; struct hl_device *hdev; struct kref refcount; - struct dma_fence *cs_pending[HL_MAX_PENDING_CS]; + struct dma_fence **cs_pending; struct hl_va_range *host_va_range; struct hl_va_range *host_huge_va_range; struct hl_va_range *dram_va_range; diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c index f4434b39ef1b..29b96d24edc2 100644 --- a/drivers/misc/habanalabs/hw_queue.c +++ b/drivers/misc/habanalabs/hw_queue.c @@ -376,7 +376,7 @@ static void hw_queue_schedule_job(struct hl_cs_job *job) * write address offset in the SM block (QMAN LBW message). * The write address offset is calculated as "COMP_OFFSET << 2". */ - offset = job->cs->sequence & (HL_MAX_PENDING_CS - 1); + offset = job->cs->sequence & (hdev->asic_prop.max_pending_cs - 1); ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) | ((q->pi << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK); From 21e7a34634495939b195e5dea51e7628c3d7ab3a Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Thu, 14 May 2020 18:25:47 +0300 Subject: [PATCH 03/28] habanalabs: sync stream generic functionality Currently sync stream is limited only for external queues. We want to remove this constraint by adding a new queue property dedicated for sync stream. In addition we move the initialization and reset methods to the common code since we can re-use them with slight changes. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/command_submission.c | 6 ++- drivers/misc/habanalabs/gaudi/gaudi.c | 46 ++----------------- drivers/misc/habanalabs/gaudi/gaudiP.h | 2 - drivers/misc/habanalabs/goya/goya.c | 12 ----- drivers/misc/habanalabs/habanalabs.h | 19 ++++++-- drivers/misc/habanalabs/hw_queue.c | 48 ++++++++++++++++++-- 6 files changed, 67 insertions(+), 66 deletions(-) diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c index e99c1d126bd3..62dab99dda98 100644 --- a/drivers/misc/habanalabs/command_submission.c +++ b/drivers/misc/habanalabs/command_submission.c @@ -740,6 +740,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, struct hl_cs_job *job; struct hl_cs *cs; struct hl_cb *cb; + enum hl_queue_type q_type; u64 *signal_seq_arr = NULL, signal_seq; u32 size_to_copy, q_idx, signal_seq_arr_len, cb_size; int rc; @@ -772,9 +773,10 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, chunk = &cs_chunk_array[0]; q_idx = chunk->queue_index; hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx]; + q_type = hw_queue_prop->type; if ((q_idx >= HL_MAX_QUEUES) || - (hw_queue_prop->type != QUEUE_TYPE_EXT)) { + (!hw_queue_prop->supports_sync_stream)) { dev_err(hdev->dev, "Queue index %d is invalid\n", q_idx); rc = -EINVAL; goto free_cs_chunk_array; @@ -871,7 +873,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, *cs_seq = cs->sequence; - job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true); + job = hl_cs_allocate_job(hdev, q_type, true); if (!job) { dev_err(hdev->dev, "Failed to allocate a new job\n"); rc = -ENOMEM; diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 9d9cbcd5a28a..fc377c618af0 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -345,10 +345,12 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev) prop->hw_queues_props[i].type = QUEUE_TYPE_EXT; prop->hw_queues_props[i].driver_only = 0; prop->hw_queues_props[i].requires_kernel_cb = 1; + prop->hw_queues_props[i].supports_sync_stream = 1; } else if (gaudi_queue_type[i] == QUEUE_TYPE_CPU) { prop->hw_queues_props[i].type = QUEUE_TYPE_CPU; prop->hw_queues_props[i].driver_only = 1; prop->hw_queues_props[i].requires_kernel_cb = 0; + prop->hw_queues_props[i].supports_sync_stream = 0; } else if (gaudi_queue_type[i] == QUEUE_TYPE_INT) { prop->hw_queues_props[i].type = QUEUE_TYPE_INT; prop->hw_queues_props[i].driver_only = 0; @@ -357,6 +359,7 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev) prop->hw_queues_props[i].type = QUEUE_TYPE_NA; prop->hw_queues_props[i].driver_only = 0; prop->hw_queues_props[i].requires_kernel_cb = 0; + prop->hw_queues_props[i].supports_sync_stream = 0; } } @@ -364,7 +367,8 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev) prop->hw_queues_props[i].type = QUEUE_TYPE_NA; prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES; - + prop->sync_stream_first_sob = 0; + prop->sync_stream_first_mon = 0; prop->dram_base_address = DRAM_PHYS_BASE; prop->dram_size = GAUDI_HBM_SIZE_32GB; prop->dram_end_address = prop->dram_base_address + @@ -6296,44 +6300,6 @@ static u32 gaudi_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx) return gaudi_cq_assignment[cq_idx]; } -static void gaudi_ext_queue_init(struct hl_device *hdev, u32 q_idx) -{ - struct gaudi_device *gaudi = hdev->asic_specific; - struct hl_hw_queue *hw_queue = &hdev->kernel_queues[q_idx]; - struct hl_hw_sob *hw_sob; - int sob, ext_idx = gaudi->ext_queue_idx++; - - /* - * The external queues might not sit sequentially, hence use the - * real external queue index for the SOB/MON base id. - */ - hw_queue->base_sob_id = ext_idx * HL_RSVD_SOBS; - hw_queue->base_mon_id = ext_idx * HL_RSVD_MONS; - hw_queue->next_sob_val = 1; - hw_queue->curr_sob_offset = 0; - - for (sob = 0 ; sob < HL_RSVD_SOBS ; sob++) { - hw_sob = &hw_queue->hw_sob[sob]; - hw_sob->hdev = hdev; - hw_sob->sob_id = hw_queue->base_sob_id + sob; - hw_sob->q_idx = q_idx; - kref_init(&hw_sob->kref); - } -} - -static void gaudi_ext_queue_reset(struct hl_device *hdev, u32 q_idx) -{ - struct hl_hw_queue *hw_queue = &hdev->kernel_queues[q_idx]; - - /* - * In case we got here due to a stuck CS, the refcnt might be bigger - * than 1 and therefore we reset it. - */ - kref_init(&hw_queue->hw_sob[hw_queue->curr_sob_offset].kref); - hw_queue->curr_sob_offset = 0; - hw_queue->next_sob_val = 1; -} - static u32 gaudi_get_signal_cb_size(struct hl_device *hdev) { return sizeof(struct packet_msg_short) + @@ -6636,8 +6602,6 @@ static const struct hl_asic_funcs gaudi_funcs = { .read_device_fw_version = gaudi_read_device_fw_version, .load_firmware_to_device = gaudi_load_firmware_to_device, .load_boot_fit_to_device = gaudi_load_boot_fit_to_device, - .ext_queue_init = gaudi_ext_queue_init, - .ext_queue_reset = gaudi_ext_queue_reset, .get_signal_cb_size = gaudi_get_signal_cb_size, .get_wait_cb_size = gaudi_get_wait_cb_size, .gen_signal_cb = gaudi_gen_signal_cb, diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h index 63baef1e4e99..3958fe38c8ee 100644 --- a/drivers/misc/habanalabs/gaudi/gaudiP.h +++ b/drivers/misc/habanalabs/gaudi/gaudiP.h @@ -234,7 +234,6 @@ struct gaudi_internal_qman_info { * engine. * @multi_msi_mode: whether we are working in multi MSI single MSI mode. * Multi MSI is possible only with IOMMU enabled. - * @ext_queue_idx: helper index for external queues initialization. * @mmu_cache_inv_pi: PI for MMU cache invalidation flow. The H/W expects an * 8-bit value so use u8. */ @@ -255,7 +254,6 @@ struct gaudi_device { u32 events_stat_aggregate[GAUDI_EVENT_SIZE]; u32 hw_cap_initialized; u8 multi_msi_mode; - u8 ext_queue_idx; u8 mmu_cache_inv_pi; }; diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 6dccaec95ffb..ff9e8a31ced4 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -5156,16 +5156,6 @@ u32 goya_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx) return cq_idx; } -static void goya_ext_queue_init(struct hl_device *hdev, u32 q_idx) -{ - -} - -static void goya_ext_queue_reset(struct hl_device *hdev, u32 q_idx) -{ - -} - static u32 goya_get_signal_cb_size(struct hl_device *hdev) { return 0; @@ -5279,8 +5269,6 @@ static const struct hl_asic_funcs goya_funcs = { .read_device_fw_version = goya_read_device_fw_version, .load_firmware_to_device = goya_load_firmware_to_device, .load_boot_fit_to_device = goya_load_boot_fit_to_device, - .ext_queue_init = goya_ext_queue_init, - .ext_queue_reset = goya_ext_queue_reset, .get_signal_cb_size = goya_get_signal_cb_size, .get_wait_cb_size = goya_get_wait_cb_size, .gen_signal_cb = goya_gen_signal_cb, diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 64d9b2dd3e19..8cd4b55d0608 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -50,6 +50,10 @@ /* MMU */ #define MMU_HASH_TABLE_BITS 7 /* 1 << 7 buckets */ +/* + * HL_RSVD_SOBS 'sync stream' reserved sync objects per QMAN stream + * HL_RSVD_MONS 'sync stream' reserved monitors per QMAN stream + */ #define HL_RSVD_SOBS 4 #define HL_RSVD_MONS 2 @@ -141,11 +145,13 @@ struct hl_hw_sob { * false otherwise. * @requires_kernel_cb: true if a CB handle must be provided for jobs on this * queue, false otherwise (a CB address must be provided). + * @supports_sync_stream: True if queue supports sync stream */ struct hw_queue_properties { enum hl_queue_type type; u8 driver_only; u8 requires_kernel_cb; + u8 supports_sync_stream; }; /** @@ -245,6 +251,9 @@ struct hl_mmu_properties { * @cb_pool_cb_cnt: number of CBs in the CB pool. * @cb_pool_cb_size: size of each CB in the CB pool. * @tpc_enabled_mask: which TPCs are enabled. + * @sync_stream_first_sob: first sync object available for sync stream use + * @sync_stream_first_mon: first monitor available for sync stream use + * @tpc_enabled_mask: which TPCs are enabled. * @completion_queues_count: number of completion queues. */ struct asic_fixed_properties { @@ -286,6 +295,8 @@ struct asic_fixed_properties { u32 cb_pool_cb_cnt; u32 cb_pool_cb_size; u32 max_pending_cs; + u16 sync_stream_first_sob; + u16 sync_stream_first_mon; u8 tpc_enabled_mask; u8 completion_queues_count; }; @@ -423,6 +434,7 @@ struct hl_cs_job; * exist). * @curr_sob_offset: the id offset to the currently used SOB from the * HL_RSVD_SOBS that are being used by this queue. + * @supports_sync_stream: True if queue supports sync stream */ struct hl_hw_queue { struct hl_hw_sob hw_sob[HL_RSVD_SOBS]; @@ -441,6 +453,7 @@ struct hl_hw_queue { u16 base_mon_id; u8 valid; u8 curr_sob_offset; + u8 supports_sync_stream; }; /** @@ -603,8 +616,6 @@ enum hl_pll_frequency { * contained in registers * @load_firmware_to_device: load the firmware to the device's memory * @load_boot_fit_to_device: load boot fit to device's memory - * @ext_queue_init: Initialize the given external queue. - * @ext_queue_reset: Reset the given external queue. * @get_signal_cb_size: Get signal CB size. * @get_wait_cb_size: Get wait CB size. * @gen_signal_cb: Generate a signal CB. @@ -707,8 +718,6 @@ struct hl_asic_funcs { enum hl_fw_component fwc); int (*load_firmware_to_device)(struct hl_device *hdev); int (*load_boot_fit_to_device)(struct hl_device *hdev); - void (*ext_queue_init)(struct hl_device *hdev, u32 hw_queue_id); - void (*ext_queue_reset)(struct hl_device *hdev, u32 hw_queue_id); u32 (*get_signal_cb_size)(struct hl_device *hdev); u32 (*get_wait_cb_size)(struct hl_device *hdev); void (*gen_signal_cb)(struct hl_device *hdev, void *data, u16 sob_id); @@ -1436,6 +1445,7 @@ struct hl_device_idle_busy_ts { * @cdev_sysfs_created: were char devices and sysfs nodes created. * @stop_on_err: true if engines should stop on error. * @supports_sync_stream: is sync stream supported. + * @sync_stream_queue_idx: helper index for sync stream queues initialization. * @supports_coresight: is CoreSight supported. * @supports_soft_reset: is soft reset supported. */ @@ -1523,6 +1533,7 @@ struct hl_device { u8 cdev_sysfs_created; u8 stop_on_err; u8 supports_sync_stream; + u8 sync_stream_queue_idx; u8 supports_coresight; u8 supports_soft_reset; diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c index 29b96d24edc2..27f0c34b63b9 100644 --- a/drivers/misc/habanalabs/hw_queue.c +++ b/drivers/misc/habanalabs/hw_queue.c @@ -663,9 +663,6 @@ static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q, q->ci = 0; q->pi = 0; - if (!is_cpu_queue) - hdev->asic_funcs->ext_queue_init(hdev, q->hw_queue_id); - return 0; free_queue: @@ -732,6 +729,42 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) return 0; } +static void sync_stream_queue_init(struct hl_device *hdev, u32 q_idx) +{ + struct hl_hw_queue *hw_queue = &hdev->kernel_queues[q_idx]; + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct hl_hw_sob *hw_sob; + int sob, queue_idx = hdev->sync_stream_queue_idx++; + + hw_queue->base_sob_id = + prop->sync_stream_first_sob + queue_idx * HL_RSVD_SOBS; + hw_queue->base_mon_id = + prop->sync_stream_first_mon + queue_idx * HL_RSVD_MONS; + hw_queue->next_sob_val = 1; + hw_queue->curr_sob_offset = 0; + + for (sob = 0 ; sob < HL_RSVD_SOBS ; sob++) { + hw_sob = &hw_queue->hw_sob[sob]; + hw_sob->hdev = hdev; + hw_sob->sob_id = hw_queue->base_sob_id + sob; + hw_sob->q_idx = q_idx; + kref_init(&hw_sob->kref); + } +} + +static void sync_stream_queue_reset(struct hl_device *hdev, u32 q_idx) +{ + struct hl_hw_queue *hw_queue = &hdev->kernel_queues[q_idx]; + + /* + * In case we got here due to a stuck CS, the refcnt might be bigger + * than 1 and therefore we reset it. + */ + kref_init(&hw_queue->hw_sob[hw_queue->curr_sob_offset].kref); + hw_queue->curr_sob_offset = 0; + hw_queue->next_sob_val = 1; +} + /* * queue_init - main initialization function for H/W queue object * @@ -774,6 +807,9 @@ static int queue_init(struct hl_device *hdev, struct hl_hw_queue *q, break; } + if (q->supports_sync_stream) + sync_stream_queue_init(hdev, q->hw_queue_id); + if (rc) return rc; @@ -848,6 +884,8 @@ int hl_hw_queues_create(struct hl_device *hdev) i < HL_MAX_QUEUES ; i++, q_ready_cnt++, q++) { q->queue_type = asic->hw_queues_props[i].type; + q->supports_sync_stream = + asic->hw_queues_props[i].supports_sync_stream; rc = queue_init(hdev, q, i); if (rc) { dev_err(hdev->dev, @@ -889,7 +927,7 @@ void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset) continue; q->pi = q->ci = 0; - if (q->queue_type == QUEUE_TYPE_EXT) - hdev->asic_funcs->ext_queue_reset(hdev, q->hw_queue_id); + if (q->supports_sync_stream) + sync_stream_queue_reset(hdev, q->hw_queue_id); } } From 6c07bab34b07eee62cfdc4a0bcce2732986b4288 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Mon, 1 Jun 2020 10:38:46 +0300 Subject: [PATCH 04/28] habanalabs: Use mask instead of shift in sync stream registers Use proper bitfield masks instead of shifting values when configuring packets sent to device. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 70 ++++++++++--------- .../habanalabs/include/gaudi/gaudi_packets.h | 4 +- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index fc377c618af0..0c5d5831f327 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * Gaudi security scheme: @@ -6322,16 +6323,17 @@ static void gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id) pkt = (struct packet_msg_short *) (uintptr_t) cb->kernel_address; memset(pkt, 0, sizeof(*pkt)); - value = 1 << GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_SHIFT; /* inc by 1 */ - value |= 1 << GAUDI_PKT_SHORT_VAL_SOB_MOD_SHIFT; /* add mode */ + /* Inc by 1, Mode ADD */ + value = FIELD_PREP(GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_MASK, 1); + value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_SOB_MOD_MASK, 1); - ctl = (sob_id * 4) << GAUDI_PKT_SHORT_CTL_ADDR_SHIFT; /* SOB id */ - ctl |= 0 << GAUDI_PKT_SHORT_CTL_OP_SHIFT; /* write the value */ - ctl |= 3 << GAUDI_PKT_SHORT_CTL_BASE_SHIFT; /* W_S SOB base */ - ctl |= PACKET_MSG_SHORT << GAUDI_PKT_SHORT_CTL_OPCODE_SHIFT; - ctl |= 1 << GAUDI_PKT_SHORT_CTL_EB_SHIFT; - ctl |= 1 << GAUDI_PKT_SHORT_CTL_RB_SHIFT; - ctl |= 1 << GAUDI_PKT_SHORT_CTL_MB_SHIFT; + ctl = FIELD_PREP(GAUDI_PKT_SHORT_CTL_ADDR_MASK, sob_id * 4); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OP_MASK, 0); /* write the value */ + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_BASE_MASK, 3); /* W_S SOB base */ + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OPCODE_MASK, PACKET_MSG_SHORT); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_EB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_RB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_MB_MASK, 1); pkt->value = cpu_to_le32(value); pkt->ctl = cpu_to_le32(ctl); @@ -6344,12 +6346,12 @@ static u32 gaudi_add_mon_msg_short(struct packet_msg_short *pkt, u32 value, memset(pkt, 0, pkt_size); - ctl = addr << GAUDI_PKT_SHORT_CTL_ADDR_SHIFT; - ctl |= 2 << GAUDI_PKT_SHORT_CTL_BASE_SHIFT; /* W_S MON base */ - ctl |= PACKET_MSG_SHORT << GAUDI_PKT_SHORT_CTL_OPCODE_SHIFT; - ctl |= 0 << GAUDI_PKT_SHORT_CTL_EB_SHIFT; - ctl |= 1 << GAUDI_PKT_SHORT_CTL_RB_SHIFT; - ctl |= 0 << GAUDI_PKT_SHORT_CTL_MB_SHIFT; /* only last pkt needs MB */ + ctl = FIELD_PREP(GAUDI_PKT_SHORT_CTL_ADDR_MASK, addr); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_BASE_MASK, 2); /* W_S MON base */ + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OPCODE_MASK, PACKET_MSG_SHORT); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_EB_MASK, 0); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_RB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_MB_MASK, 0); /* last pkt MB */ pkt->value = cpu_to_le32(value); pkt->ctl = cpu_to_le32(ctl); @@ -6365,18 +6367,19 @@ static u32 gaudi_add_arm_monitor_pkt(struct packet_msg_short *pkt, u16 sob_id, memset(pkt, 0, pkt_size); - value = (sob_id / 8) << GAUDI_PKT_SHORT_VAL_MON_SYNC_GID_SHIFT; - value |= sob_val << GAUDI_PKT_SHORT_VAL_MON_SYNC_VAL_SHIFT; - value |= 0 << GAUDI_PKT_SHORT_VAL_MON_MODE_SHIFT; /* GREATER_OR_EQUAL */ - value |= mask << GAUDI_PKT_SHORT_VAL_MON_MASK_SHIFT; + value = FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_SYNC_GID_MASK, sob_id / 8); + value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_SYNC_VAL_MASK, sob_val); + value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_MODE_MASK, + 0); /* GREATER OR EQUAL*/ + value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_MASK_MASK, mask); - ctl = addr << GAUDI_PKT_SHORT_CTL_ADDR_SHIFT; - ctl |= 0 << GAUDI_PKT_SHORT_CTL_OP_SHIFT; /* write the value */ - ctl |= 2 << GAUDI_PKT_SHORT_CTL_BASE_SHIFT; /* W_S MON base */ - ctl |= PACKET_MSG_SHORT << GAUDI_PKT_SHORT_CTL_OPCODE_SHIFT; - ctl |= 0 << GAUDI_PKT_SHORT_CTL_EB_SHIFT; - ctl |= 1 << GAUDI_PKT_SHORT_CTL_RB_SHIFT; - ctl |= 1 << GAUDI_PKT_SHORT_CTL_MB_SHIFT; + ctl = FIELD_PREP(GAUDI_PKT_SHORT_CTL_ADDR_MASK, addr); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OP_MASK, 0); /* write the value */ + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_BASE_MASK, 2); /* W_S MON base */ + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OPCODE_MASK, PACKET_MSG_SHORT); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_EB_MASK, 0); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_RB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_MB_MASK, 1); pkt->value = cpu_to_le32(value); pkt->ctl = cpu_to_le32(ctl); @@ -6390,15 +6393,14 @@ static u32 gaudi_add_fence_pkt(struct packet_fence *pkt) memset(pkt, 0, pkt_size); - cfg = 1 << GAUDI_PKT_FENCE_CFG_DEC_VAL_SHIFT; - cfg |= 1 << GAUDI_PKT_FENCE_CFG_TARGET_VAL_SHIFT; - cfg |= 2 << GAUDI_PKT_FENCE_CFG_ID_SHIFT; + cfg = FIELD_PREP(GAUDI_PKT_FENCE_CFG_DEC_VAL_MASK, 1); + cfg |= FIELD_PREP(GAUDI_PKT_FENCE_CFG_TARGET_VAL_MASK, 1); + cfg |= FIELD_PREP(GAUDI_PKT_FENCE_CFG_ID_MASK, 2); - ctl = 0 << GAUDI_PKT_FENCE_CTL_PRED_SHIFT; - ctl |= PACKET_FENCE << GAUDI_PKT_FENCE_CTL_OPCODE_SHIFT; - ctl |= 0 << GAUDI_PKT_FENCE_CTL_EB_SHIFT; - ctl |= 1 << GAUDI_PKT_FENCE_CTL_RB_SHIFT; - ctl |= 1 << GAUDI_PKT_FENCE_CTL_MB_SHIFT; + ctl = FIELD_PREP(GAUDI_PKT_FENCE_CTL_OPCODE_MASK, PACKET_FENCE); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_EB_MASK, 0); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_RB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_MB_MASK, 1); pkt->cfg = cpu_to_le32(cfg); pkt->ctl = cpu_to_le32(ctl); diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h b/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h index 0f0cd067bb43..f30f2c0458d7 100644 --- a/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h +++ b/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h @@ -85,7 +85,7 @@ struct packet_msg_long { }; #define GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_SHIFT 0 -#define GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_MASK 0x0000EFFF +#define GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_MASK 0x00007FFF #define GAUDI_PKT_SHORT_VAL_SOB_MOD_SHIFT 31 #define GAUDI_PKT_SHORT_VAL_SOB_MOD_MASK 0x80000000 @@ -141,7 +141,7 @@ struct packet_msg_prot { #define GAUDI_PKT_FENCE_CFG_TARGET_VAL_MASK 0x00FF0000 #define GAUDI_PKT_FENCE_CFG_ID_SHIFT 30 -#define GAUDI_PKT_FENCE_CFG_ID_MASK 0xC000000 +#define GAUDI_PKT_FENCE_CFG_ID_MASK 0xC0000000 #define GAUDI_PKT_FENCE_CTL_PRED_SHIFT 0 #define GAUDI_PKT_FENCE_CTL_PRED_MASK 0x0000001F From 3bf1c021e36e7269c71dceafec713b0181f413a9 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Tue, 9 Jun 2020 16:14:55 +0300 Subject: [PATCH 05/28] uapi/habanalabs: fix some comments MAP/UNMAP are done also for device memory. Reviewed-by: Omer Shpigelman Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index f6267a8d7416..f218d1c62c62 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -530,13 +530,13 @@ union hl_wait_cs_args { struct hl_wait_cs_out out; }; -/* Opcode to alloc device memory */ +/* Opcode to allocate device memory */ #define HL_MEM_OP_ALLOC 0 /* Opcode to free previously allocated device memory */ #define HL_MEM_OP_FREE 1 -/* Opcode to map host memory */ +/* Opcode to map host and device memory */ #define HL_MEM_OP_MAP 2 -/* Opcode to unmap previously mapped host memory */ +/* Opcode to unmap previously mapped host and device memory */ #define HL_MEM_OP_UNMAP 3 /* Memory flags */ From 6ced91170df88ff03b7d4c9ec87c185ebf8443be Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Tue, 9 Jun 2020 19:58:44 +0300 Subject: [PATCH 06/28] habanalabs: align armcp_packet structure to 8 bytes Once there is a 64-bit field in a structure, GCC compiler for ARM aligns the structure to 8 bytes. In order to avoid confusion when these structures are being passed between CPUs from different architectures, we explicitly align the structure to 8 bytes. Reviewed-by: Omer Shpigelman Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/include/armcp_if.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/misc/habanalabs/include/armcp_if.h b/drivers/misc/habanalabs/include/armcp_if.h index a34fc39ad87e..dea7c90faafa 100644 --- a/drivers/misc/habanalabs/include/armcp_if.h +++ b/drivers/misc/habanalabs/include/armcp_if.h @@ -276,6 +276,8 @@ struct armcp_packet { /* For get Armcp info/EEPROM data */ __le32 data_max_size; }; + + __le32 reserved; }; struct armcp_unmask_irq_arr_packet { From e8edded6939e4c194ab302d4913cb1a9319561d9 Mon Sep 17 00:00:00 2001 From: Adam Aharon Date: Tue, 26 May 2020 11:04:30 +0300 Subject: [PATCH 07/28] habanalabs: calculate trace frequency from PLL The profiler needs to know the PLL values for correctly showing the profiling data. Because our firmware can use different PLL configurations, we need to read the PLL values from the ASIC to pass them to the profiler. Signed-off-by: Adam Aharon Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 33 ++++- .../misc/habanalabs/gaudi/gaudi_coresight.c | 6 +- drivers/misc/habanalabs/goya/goya.c | 33 ++++- drivers/misc/habanalabs/goya/goya_coresight.c | 6 +- drivers/misc/habanalabs/habanalabs.h | 11 ++ .../include/gaudi/asic_reg/gaudi_regs.h | 1 + .../gaudi/asic_reg/psoc_cpu_pll_regs.h | 114 ++++++++++++++++++ 7 files changed, 194 insertions(+), 10 deletions(-) create mode 100644 drivers/misc/habanalabs/include/gaudi/asic_reg/psoc_cpu_pll_regs.h diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 0c5d5831f327..aa4139626a04 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -555,11 +555,36 @@ static int gaudi_early_fini(struct hl_device *hdev) static void gaudi_fetch_psoc_frequency(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; + u32 trace_freq = 0; + u32 pll_clk = 0; + u32 div_fctr = RREG32(mmPSOC_CPU_PLL_DIV_FACTOR_2); + u32 div_sel = RREG32(mmPSOC_CPU_PLL_DIV_SEL_2); + u32 nr = RREG32(mmPSOC_CPU_PLL_NR); + u32 nf = RREG32(mmPSOC_CPU_PLL_NF); + u32 od = RREG32(mmPSOC_CPU_PLL_OD); - prop->psoc_pci_pll_nr = RREG32(mmPSOC_PCI_PLL_NR); - prop->psoc_pci_pll_nf = RREG32(mmPSOC_PCI_PLL_NF); - prop->psoc_pci_pll_od = RREG32(mmPSOC_PCI_PLL_OD); - prop->psoc_pci_pll_div_factor = RREG32(mmPSOC_PCI_PLL_DIV_FACTOR_1); + if (div_sel == DIV_SEL_REF_CLK || div_sel == DIV_SEL_DIVIDED_REF) { + if (div_sel == DIV_SEL_REF_CLK) + trace_freq = PLL_REF_CLK; + else + trace_freq = PLL_REF_CLK / (div_fctr + 1); + } else if (div_sel == DIV_SEL_PLL_CLK || + div_sel == DIV_SEL_DIVIDED_PLL) { + pll_clk = PLL_REF_CLK * (nf + 1) / ((nr + 1) * (od + 1)); + if (div_sel == DIV_SEL_PLL_CLK) + trace_freq = pll_clk; + else + trace_freq = pll_clk / (div_fctr + 1); + } else { + dev_warn(hdev->dev, + "Received invalid div select value: %d", div_sel); + } + + prop->psoc_timestamp_frequency = trace_freq; + prop->psoc_pci_pll_nr = nr; + prop->psoc_pci_pll_nf = nf; + prop->psoc_pci_pll_od = od; + prop->psoc_pci_pll_div_factor = div_fctr; } static int _gaudi_init_tpc_mem(struct hl_device *hdev, diff --git a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c index bf0e062d7b87..c32322cb1728 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c +++ b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c @@ -392,6 +392,7 @@ static int gaudi_config_stm(struct hl_device *hdev, { struct hl_debug_params_stm *input; u64 base_reg; + u32 frequency; int rc; if (params->reg_idx >= ARRAY_SIZE(debug_stm_regs)) { @@ -420,7 +421,10 @@ static int gaudi_config_stm(struct hl_device *hdev, WREG32(base_reg + 0xE00, lower_32_bits(input->sp_mask)); WREG32(base_reg + 0xEF4, input->id); WREG32(base_reg + 0xDF4, 0x80); - WREG32(base_reg + 0xE8C, input->frequency); + frequency = hdev->asic_prop.psoc_timestamp_frequency; + if (frequency == 0) + frequency = input->frequency; + WREG32(base_reg + 0xE8C, frequency); WREG32(base_reg + 0xE90, 0x7FF); /* SW-2176 - SW WA for HW bug */ diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index ff9e8a31ced4..ff32a8fa7624 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -594,11 +594,36 @@ static void goya_qman0_set_security(struct hl_device *hdev, bool secure) static void goya_fetch_psoc_frequency(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; + u32 trace_freq = 0; + u32 pll_clk = 0; + u32 div_fctr = RREG32(mmPSOC_PCI_PLL_DIV_FACTOR_1); + u32 div_sel = RREG32(mmPSOC_PCI_PLL_DIV_SEL_1); + u32 nr = RREG32(mmPSOC_PCI_PLL_NR); + u32 nf = RREG32(mmPSOC_PCI_PLL_NF); + u32 od = RREG32(mmPSOC_PCI_PLL_OD); - prop->psoc_pci_pll_nr = RREG32(mmPSOC_PCI_PLL_NR); - prop->psoc_pci_pll_nf = RREG32(mmPSOC_PCI_PLL_NF); - prop->psoc_pci_pll_od = RREG32(mmPSOC_PCI_PLL_OD); - prop->psoc_pci_pll_div_factor = RREG32(mmPSOC_PCI_PLL_DIV_FACTOR_1); + if (div_sel == DIV_SEL_REF_CLK || div_sel == DIV_SEL_DIVIDED_REF) { + if (div_sel == DIV_SEL_REF_CLK) + trace_freq = PLL_REF_CLK; + else + trace_freq = PLL_REF_CLK / (div_fctr + 1); + } else if (div_sel == DIV_SEL_PLL_CLK || + div_sel == DIV_SEL_DIVIDED_PLL) { + pll_clk = PLL_REF_CLK * (nf + 1) / ((nr + 1) * (od + 1)); + if (div_sel == DIV_SEL_PLL_CLK) + trace_freq = pll_clk; + else + trace_freq = pll_clk / (div_fctr + 1); + } else { + dev_warn(hdev->dev, + "Received invalid div select value: %d", div_sel); + } + + prop->psoc_timestamp_frequency = trace_freq; + prop->psoc_pci_pll_nr = nr; + prop->psoc_pci_pll_nf = nf; + prop->psoc_pci_pll_od = od; + prop->psoc_pci_pll_div_factor = div_fctr; } int goya_late_init(struct hl_device *hdev) diff --git a/drivers/misc/habanalabs/goya/goya_coresight.c b/drivers/misc/habanalabs/goya/goya_coresight.c index aa51fc71f0a1..18e12e9d284b 100644 --- a/drivers/misc/habanalabs/goya/goya_coresight.c +++ b/drivers/misc/habanalabs/goya/goya_coresight.c @@ -232,6 +232,7 @@ static int goya_config_stm(struct hl_device *hdev, { struct hl_debug_params_stm *input; u64 base_reg; + u32 frequency; int rc; if (params->reg_idx >= ARRAY_SIZE(debug_stm_regs)) { @@ -264,7 +265,10 @@ static int goya_config_stm(struct hl_device *hdev, WREG32(base_reg + 0xE20, 0xFFFFFFFF); WREG32(base_reg + 0xEF4, input->id); WREG32(base_reg + 0xDF4, 0x80); - WREG32(base_reg + 0xE8C, input->frequency); + frequency = hdev->asic_prop.psoc_timestamp_frequency; + if (frequency == 0) + frequency = input->frequency; + WREG32(base_reg + 0xE8C, frequency); WREG32(base_reg + 0xE90, 0x7FF); WREG32(base_reg + 0xE80, 0x27 | (input->id << 16)); } else { diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 8cd4b55d0608..4e68a41cce77 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -247,6 +247,7 @@ struct hl_mmu_properties { * @psoc_pci_pll_nf: PCI PLL NF value. * @psoc_pci_pll_od: PCI PLL OD value. * @psoc_pci_pll_div_factor: PCI PLL DIV FACTOR 1 value. + * @psoc_timestamp_frequency: frequency of the psoc timestamp clock. * @high_pll: high PLL frequency used by the device. * @cb_pool_cb_cnt: number of CBs in the CB pool. * @cb_pool_cb_size: size of each CB in the CB pool. @@ -291,6 +292,7 @@ struct asic_fixed_properties { u32 psoc_pci_pll_nf; u32 psoc_pci_pll_od; u32 psoc_pci_pll_div_factor; + u32 psoc_timestamp_frequency; u32 high_pll; u32 cb_pool_cb_cnt; u32 cb_pool_cb_size; @@ -533,6 +535,15 @@ enum hl_pll_frequency { PLL_LAST }; +#define PLL_REF_CLK 50 + +enum div_select_defs { + DIV_SEL_REF_CLK = 0, + DIV_SEL_PLL_CLK = 1, + DIV_SEL_DIVIDED_REF = 2, + DIV_SEL_DIVIDED_PLL = 3, +}; + /** * struct hl_asic_funcs - ASIC specific functions that are can be called from * common code. diff --git a/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h b/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h index 85e3b5148595..62078077aee5 100644 --- a/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h +++ b/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h @@ -91,6 +91,7 @@ #include "psoc_pci_pll_regs.h" #include "psoc_hbm_pll_regs.h" +#include "psoc_cpu_pll_regs.h" #define GAUDI_ECC_MEM_SEL_OFFSET 0xF18 #define GAUDI_ECC_ADDRESS_OFFSET 0xF1C diff --git a/drivers/misc/habanalabs/include/gaudi/asic_reg/psoc_cpu_pll_regs.h b/drivers/misc/habanalabs/include/gaudi/asic_reg/psoc_cpu_pll_regs.h new file mode 100644 index 000000000000..2585c70f59ef --- /dev/null +++ b/drivers/misc/habanalabs/include/gaudi/asic_reg/psoc_cpu_pll_regs.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2016-2018 HabanaLabs, Ltd. + * All Rights Reserved. + * + */ + +/************************************ + ** This is an auto-generated file ** + ** DO NOT EDIT BELOW ** + ************************************/ + +#ifndef ASIC_REG_PSOC_CPU_PLL_REGS_H_ +#define ASIC_REG_PSOC_CPU_PLL_REGS_H_ + +/* + ***************************************** + * PSOC_CPU_PLL (Prototype: PLL) + ***************************************** + */ + +#define mmPSOC_CPU_PLL_NR 0xC70100 + +#define mmPSOC_CPU_PLL_NF 0xC70104 + +#define mmPSOC_CPU_PLL_OD 0xC70108 + +#define mmPSOC_CPU_PLL_NB 0xC7010C + +#define mmPSOC_CPU_PLL_CFG 0xC70110 + +#define mmPSOC_CPU_PLL_LOSE_MASK 0xC70120 + +#define mmPSOC_CPU_PLL_LOCK_INTR 0xC70128 + +#define mmPSOC_CPU_PLL_LOCK_BYPASS 0xC7012C + +#define mmPSOC_CPU_PLL_DATA_CHNG 0xC70130 + +#define mmPSOC_CPU_PLL_RST 0xC70134 + +#define mmPSOC_CPU_PLL_SLIP_WD_CNTR 0xC70150 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_0 0xC70200 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_1 0xC70204 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_2 0xC70208 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_3 0xC7020C + +#define mmPSOC_CPU_PLL_DIV_FACTOR_CMD_0 0xC70220 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_CMD_1 0xC70224 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_CMD_2 0xC70228 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_CMD_3 0xC7022C + +#define mmPSOC_CPU_PLL_DIV_SEL_0 0xC70280 + +#define mmPSOC_CPU_PLL_DIV_SEL_1 0xC70284 + +#define mmPSOC_CPU_PLL_DIV_SEL_2 0xC70288 + +#define mmPSOC_CPU_PLL_DIV_SEL_3 0xC7028C + +#define mmPSOC_CPU_PLL_DIV_EN_0 0xC702A0 + +#define mmPSOC_CPU_PLL_DIV_EN_1 0xC702A4 + +#define mmPSOC_CPU_PLL_DIV_EN_2 0xC702A8 + +#define mmPSOC_CPU_PLL_DIV_EN_3 0xC702AC + +#define mmPSOC_CPU_PLL_DIV_FACTOR_BUSY_0 0xC702C0 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_BUSY_1 0xC702C4 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_BUSY_2 0xC702C8 + +#define mmPSOC_CPU_PLL_DIV_FACTOR_BUSY_3 0xC702CC + +#define mmPSOC_CPU_PLL_CLK_GATER 0xC70300 + +#define mmPSOC_CPU_PLL_CLK_RLX_0 0xC70310 + +#define mmPSOC_CPU_PLL_CLK_RLX_1 0xC70314 + +#define mmPSOC_CPU_PLL_CLK_RLX_2 0xC70318 + +#define mmPSOC_CPU_PLL_CLK_RLX_3 0xC7031C + +#define mmPSOC_CPU_PLL_REF_CNTR_PERIOD 0xC70400 + +#define mmPSOC_CPU_PLL_REF_LOW_THRESHOLD 0xC70410 + +#define mmPSOC_CPU_PLL_REF_HIGH_THRESHOLD 0xC70420 + +#define mmPSOC_CPU_PLL_PLL_NOT_STABLE 0xC70430 + +#define mmPSOC_CPU_PLL_FREQ_CALC_EN 0xC70440 + +#define mmPSOC_CPU_PLL_RLX_BITMAP_CFG 0xC70500 + +#define mmPSOC_CPU_PLL_RLX_BITMAP_0 0xC70510 + +#define mmPSOC_CPU_PLL_RLX_BITMAP_1 0xC70514 + +#define mmPSOC_CPU_PLL_RLX_BITMAP_2 0xC70518 + +#define mmPSOC_CPU_PLL_RLX_BITMAP_3 0xC7051C + +#endif /* ASIC_REG_PSOC_CPU_PLL_REGS_H_ */ From 917b79b09671740ce2e2c702641e50f890b27a67 Mon Sep 17 00:00:00 2001 From: Omer Shpigelman Date: Sat, 4 Jul 2020 22:51:16 +0300 Subject: [PATCH 08/28] habanalabs: rephrase error message Rephrase F/W error message to make it more understandable to ordinary users. Signed-off-by: Omer Shpigelman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/firmware_if.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/misc/habanalabs/firmware_if.c b/drivers/misc/habanalabs/firmware_if.c index 15e0793da655..6900c01d060f 100644 --- a/drivers/misc/habanalabs/firmware_if.c +++ b/drivers/misc/habanalabs/firmware_if.c @@ -569,7 +569,8 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg, "Device reports FIT image is corrupted\n"); else dev_err(hdev->dev, - "Device failed to load, %d\n", status); + "Failed to load firmware to device, %d\n", + status); rc = -EIO; goto out; From dd9efabd0a5ae6dab255737c808d5ab6af1e8f91 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Mon, 15 Jun 2020 12:09:50 +0300 Subject: [PATCH 09/28] habanalabs: Increase queues depth After recent concurrent cs amount increase, we must also increase queues depth since much more concurrent work can be done. All external queue depths were increased to 4096 as gaudi's internal queue depths were also increased to 1024. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudiP.h | 6 ++--- drivers/misc/habanalabs/habanalabs.h | 31 ++++---------------------- drivers/misc/habanalabs/hw_queue.c | 2 -- drivers/misc/habanalabs/irq.c | 4 ---- 4 files changed, 7 insertions(+), 36 deletions(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h index 3958fe38c8ee..bdc5f96085a7 100644 --- a/drivers/misc/habanalabs/gaudi/gaudiP.h +++ b/drivers/misc/habanalabs/gaudi/gaudiP.h @@ -123,14 +123,14 @@ /* Internal QMANs PQ sizes */ -#define MME_QMAN_LENGTH 64 +#define MME_QMAN_LENGTH 1024 #define MME_QMAN_SIZE_IN_BYTES (MME_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE) -#define HBM_DMA_QMAN_LENGTH 64 +#define HBM_DMA_QMAN_LENGTH 1024 #define HBM_DMA_QMAN_SIZE_IN_BYTES \ (HBM_DMA_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE) -#define TPC_QMAN_LENGTH 64 +#define TPC_QMAN_LENGTH 1024 #define TPC_QMAN_SIZE_IN_BYTES (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE) #define SRAM_USER_BASE_OFFSET GAUDI_DRIVER_SRAM_RESERVED_SIZE_FROM_START diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 4e68a41cce77..e4d6f7c91194 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -378,38 +378,15 @@ struct hl_cb { struct hl_cs_job; -/* - * Currently, there are two limitations on the maximum length of a queue: - * - * 1. The memory footprint of the queue. The current allocated space for the - * queue is PAGE_SIZE. Because each entry in the queue is HL_BD_SIZE, - * the maximum length of the queue can be PAGE_SIZE / HL_BD_SIZE, - * which currently is 4096/16 = 256 entries. - * - * To increase that, we need either to decrease the size of the - * BD (difficult), or allocate more than a single page (easier). - * - * 2. Because the size of the JOB handle field in the BD CTL / completion queue - * is 10-bit, we can have up to 1024 open jobs per hardware queue. - * Therefore, each queue can hold up to 1024 entries. - * - * HL_QUEUE_LENGTH is in units of struct hl_bd. - * HL_QUEUE_LENGTH * sizeof(struct hl_bd) should be <= HL_PAGE_SIZE - */ - -#define HL_PAGE_SIZE 4096 /* minimum page size */ -/* Must be power of 2 (HL_PAGE_SIZE / HL_BD_SIZE) */ -#define HL_QUEUE_LENGTH 256 +/* Queue length of external and HW queues */ +#define HL_QUEUE_LENGTH 4096 #define HL_QUEUE_SIZE_IN_BYTES (HL_QUEUE_LENGTH * HL_BD_SIZE) -/* - * HL_CQ_LENGTH is in units of struct hl_cq_entry. - * HL_CQ_LENGTH should be <= HL_PAGE_SIZE - */ +/* HL_CQ_LENGTH is in units of struct hl_cq_entry */ #define HL_CQ_LENGTH HL_QUEUE_LENGTH #define HL_CQ_SIZE_IN_BYTES (HL_CQ_LENGTH * HL_CQ_ENTRY_SIZE) -/* Must be power of 2 (HL_PAGE_SIZE / HL_EQ_ENTRY_SIZE) */ +/* Must be power of 2 */ #define HL_EQ_LENGTH 64 #define HL_EQ_SIZE_IN_BYTES (HL_EQ_LENGTH * HL_EQ_ENTRY_SIZE) diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c index 27f0c34b63b9..f5a10a5ac300 100644 --- a/drivers/misc/habanalabs/hw_queue.c +++ b/drivers/misc/habanalabs/hw_queue.c @@ -780,8 +780,6 @@ static int queue_init(struct hl_device *hdev, struct hl_hw_queue *q, { int rc; - BUILD_BUG_ON(HL_QUEUE_SIZE_IN_BYTES > HL_PAGE_SIZE); - q->hw_queue_id = hw_queue_id; switch (q->queue_type) { diff --git a/drivers/misc/habanalabs/irq.c b/drivers/misc/habanalabs/irq.c index 6981d67153b1..7a4878edb1a3 100644 --- a/drivers/misc/habanalabs/irq.c +++ b/drivers/misc/habanalabs/irq.c @@ -220,8 +220,6 @@ int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id) { void *p; - BUILD_BUG_ON(HL_CQ_SIZE_IN_BYTES > HL_PAGE_SIZE); - p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, HL_CQ_SIZE_IN_BYTES, &q->bus_address, GFP_KERNEL | __GFP_ZERO); if (!p) @@ -282,8 +280,6 @@ int hl_eq_init(struct hl_device *hdev, struct hl_eq *q) { void *p; - BUILD_BUG_ON(HL_EQ_SIZE_IN_BYTES > HL_PAGE_SIZE); - p = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, HL_EQ_SIZE_IN_BYTES, &q->bus_address); From 0eab4f89d64c1639a1b9288f316bbfa292bd9413 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 22 Jun 2020 09:52:22 +0300 Subject: [PATCH 10/28] habanalabs: rephrase error messages rephrase some error/warning/notice messages to make them more accessible to ordinary users. There is no need to print context ASID as the driver currently doesn't support multiple contexts. Signed-off-by: Oded Gabbay Reviewed-by: Tomer Tayar --- drivers/misc/habanalabs/command_submission.c | 20 +++++++++++++------- drivers/misc/habanalabs/context.c | 3 +-- drivers/misc/habanalabs/firmware_if.c | 4 ++-- drivers/misc/habanalabs/memory.c | 3 +-- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c index 62dab99dda98..f81d6685e011 100644 --- a/drivers/misc/habanalabs/command_submission.c +++ b/drivers/misc/habanalabs/command_submission.c @@ -373,9 +373,9 @@ static void cs_timedout(struct work_struct *work) hdev = cs->ctx->hdev; ctx_asid = cs->ctx->asid; - /* TODO: add information about last signaled seq and last emitted seq */ - dev_err(hdev->dev, "User %d command submission %llu got stuck!\n", - ctx_asid, cs->sequence); + dev_err(hdev->dev, + "Command submission %llu has not finished in time!\n", + cs->sequence); cs_put(cs); @@ -1130,7 +1130,7 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev, rc = PTR_ERR(fence); if (rc == -EINVAL) dev_notice_ratelimited(hdev->dev, - "Can't wait on seq %llu because current CS is at seq %llu\n", + "Can't wait on CS %llu because current CS is at seq %llu\n", seq, ctx->cs_sequence); } else if (fence) { rc = dma_fence_wait_timeout(fence, true, timeout); @@ -1163,15 +1163,21 @@ int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) memset(args, 0, sizeof(*args)); if (rc < 0) { - dev_err_ratelimited(hdev->dev, - "Error %ld on waiting for CS handle %llu\n", - rc, seq); if (rc == -ERESTARTSYS) { + dev_err_ratelimited(hdev->dev, + "user process got signal while waiting for CS handle %llu\n", + seq); args->out.status = HL_WAIT_CS_STATUS_INTERRUPTED; rc = -EINTR; } else if (rc == -ETIMEDOUT) { + dev_err_ratelimited(hdev->dev, + "CS %llu has timed-out while user process is waiting for it\n", + seq); args->out.status = HL_WAIT_CS_STATUS_TIMEDOUT; } else if (rc == -EIO) { + dev_err_ratelimited(hdev->dev, + "CS %llu has been aborted while user process is waiting for it\n", + seq); args->out.status = HL_WAIT_CS_STATUS_ABORTED; } return rc; diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/context.c index 1b96fefa4a65..1e3e5b19ecd9 100644 --- a/drivers/misc/habanalabs/context.c +++ b/drivers/misc/habanalabs/context.c @@ -112,8 +112,7 @@ void hl_ctx_free(struct hl_device *hdev, struct hl_ctx *ctx) return; dev_warn(hdev->dev, - "Context %d closed or terminated but its CS are executing\n", - ctx->asid); + "user process released device but its command submissions are still executing\n"); } int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) diff --git a/drivers/misc/habanalabs/firmware_if.c b/drivers/misc/habanalabs/firmware_if.c index 6900c01d060f..9e7f203a09d7 100644 --- a/drivers/misc/habanalabs/firmware_if.c +++ b/drivers/misc/habanalabs/firmware_if.c @@ -289,7 +289,7 @@ int hl_fw_armcp_info_get(struct hl_device *hdev) HL_ARMCP_INFO_TIMEOUT_USEC, &result); if (rc) { dev_err(hdev->dev, - "Failed to send ArmCP info pkt, error %d\n", rc); + "Failed to handle ArmCP info pkt, error %d\n", rc); goto out; } @@ -340,7 +340,7 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size) if (rc) { dev_err(hdev->dev, - "Failed to send ArmCP EEPROM packet, error %d\n", rc); + "Failed to handle ArmCP EEPROM packet, error %d\n", rc); goto out; } diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c index 47da84a17719..e4e1693e5c6c 100644 --- a/drivers/misc/habanalabs/memory.c +++ b/drivers/misc/habanalabs/memory.c @@ -1730,8 +1730,7 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx) */ if (!hdev->hard_reset_pending && !hash_empty(ctx->mem_hash)) dev_notice(hdev->dev, - "ctx %d is freed while it has va in use\n", - ctx->asid); + "user released device without removing its memory mappings\n"); hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) { dev_dbg(hdev->dev, From c8f9b49d2db7e98df534c6e666b26c64da4cb667 Mon Sep 17 00:00:00 2001 From: Christine Gharzuzi Date: Tue, 23 Jun 2020 19:21:22 +0300 Subject: [PATCH 11/28] habanalabs: extract cpu boot status lookup Extract detection of the cpu boot status to a function to allow code reuse Signed-off-by: Christine Gharzuzi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/firmware_if.c | 92 ++++++++++++++------------- 1 file changed, 48 insertions(+), 44 deletions(-) diff --git a/drivers/misc/habanalabs/firmware_if.c b/drivers/misc/habanalabs/firmware_if.c index 9e7f203a09d7..3be1549cd137 100644 --- a/drivers/misc/habanalabs/firmware_if.c +++ b/drivers/misc/habanalabs/firmware_if.c @@ -393,6 +393,53 @@ static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg) "Device boot error - NIC F/W initialization failed\n"); } +static void hl_detect_cpu_boot_status(struct hl_device *hdev, u32 status) +{ + switch (status) { + case CPU_BOOT_STATUS_NA: + dev_err(hdev->dev, + "Device boot error - BTL did NOT run\n"); + break; + case CPU_BOOT_STATUS_IN_WFE: + dev_err(hdev->dev, + "Device boot error - Stuck inside WFE loop\n"); + break; + case CPU_BOOT_STATUS_IN_BTL: + dev_err(hdev->dev, + "Device boot error - Stuck in BTL\n"); + break; + case CPU_BOOT_STATUS_IN_PREBOOT: + dev_err(hdev->dev, + "Device boot error - Stuck in Preboot\n"); + break; + case CPU_BOOT_STATUS_IN_SPL: + dev_err(hdev->dev, + "Device boot error - Stuck in SPL\n"); + break; + case CPU_BOOT_STATUS_IN_UBOOT: + dev_err(hdev->dev, + "Device boot error - Stuck in u-boot\n"); + break; + case CPU_BOOT_STATUS_DRAM_INIT_FAIL: + dev_err(hdev->dev, + "Device boot error - DRAM initialization failed\n"); + break; + case CPU_BOOT_STATUS_UBOOT_NOT_READY: + dev_err(hdev->dev, + "Device boot error - u-boot stopped by user\n"); + break; + case CPU_BOOT_STATUS_TS_INIT_FAIL: + dev_err(hdev->dev, + "Device boot error - Thermal Sensor initialization failed\n"); + break; + default: + dev_err(hdev->dev, + "Device boot error - Invalid status code %d\n", + status); + break; + } +} + int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg, u32 msg_to_cpu_reg, u32 cpu_msg_status_reg, u32 boot_err0_reg, bool skip_bmc, @@ -466,50 +513,7 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg, * versions but we keep them here for backward compatibility */ if (rc) { - switch (status) { - case CPU_BOOT_STATUS_NA: - dev_err(hdev->dev, - "Device boot error - BTL did NOT run\n"); - break; - case CPU_BOOT_STATUS_IN_WFE: - dev_err(hdev->dev, - "Device boot error - Stuck inside WFE loop\n"); - break; - case CPU_BOOT_STATUS_IN_BTL: - dev_err(hdev->dev, - "Device boot error - Stuck in BTL\n"); - break; - case CPU_BOOT_STATUS_IN_PREBOOT: - dev_err(hdev->dev, - "Device boot error - Stuck in Preboot\n"); - break; - case CPU_BOOT_STATUS_IN_SPL: - dev_err(hdev->dev, - "Device boot error - Stuck in SPL\n"); - break; - case CPU_BOOT_STATUS_IN_UBOOT: - dev_err(hdev->dev, - "Device boot error - Stuck in u-boot\n"); - break; - case CPU_BOOT_STATUS_DRAM_INIT_FAIL: - dev_err(hdev->dev, - "Device boot error - DRAM initialization failed\n"); - break; - case CPU_BOOT_STATUS_UBOOT_NOT_READY: - dev_err(hdev->dev, - "Device boot error - u-boot stopped by user\n"); - break; - case CPU_BOOT_STATUS_TS_INIT_FAIL: - dev_err(hdev->dev, - "Device boot error - Thermal Sensor initialization failed\n"); - break; - default: - dev_err(hdev->dev, - "Device boot error - Invalid status code %d\n", - status); - break; - } - + hl_detect_cpu_boot_status(hdev, status); rc = -EIO; goto out; } From db491e4f08a9fd84ebb1ebd22a6b0b988a81a0d8 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Thu, 18 Jun 2020 09:51:16 +0300 Subject: [PATCH 12/28] habanalabs: Add dropped cs statistics info struct Add command submission statistics structure which can be obtained through the info ioctl. Each drop counter describes the reason for which the command submission was dropped. This information is needed for the user to be aware of the specific reason for which the submitted work was dropped. The user can then utilize the driver more efficiently. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/command_submission.c | 24 +++++++++++++++++++- drivers/misc/habanalabs/habanalabs.h | 5 ++++ drivers/misc/habanalabs/habanalabs_ioctl.c | 24 ++++++++++++++++++++ drivers/misc/habanalabs/hw_queue.c | 5 +++- include/uapi/misc/habanalabs.h | 21 +++++++++++++++++ 5 files changed, 77 insertions(+), 2 deletions(-) diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c index f81d6685e011..777f88d25acd 100644 --- a/drivers/misc/habanalabs/command_submission.c +++ b/drivers/misc/habanalabs/command_submission.c @@ -246,6 +246,18 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job) kfree(job); } +static void cs_counters_aggregate(struct hl_device *hdev, struct hl_ctx *ctx) +{ + hdev->aggregated_cs_counters.device_in_reset_drop_cnt += + ctx->cs_counters.device_in_reset_drop_cnt; + hdev->aggregated_cs_counters.out_of_mem_drop_cnt += + ctx->cs_counters.out_of_mem_drop_cnt; + hdev->aggregated_cs_counters.parsing_drop_cnt += + ctx->cs_counters.parsing_drop_cnt; + hdev->aggregated_cs_counters.queue_full_drop_cnt += + ctx->cs_counters.queue_full_drop_cnt; +} + static void cs_do_release(struct kref *ref) { struct hl_cs *cs = container_of(ref, struct hl_cs, @@ -349,6 +361,8 @@ static void cs_do_release(struct kref *ref) dma_fence_signal(cs->fence); dma_fence_put(cs->fence); + cs_counters_aggregate(hdev, cs->ctx); + kfree(cs); } @@ -632,12 +646,15 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, rc = validate_queue_index(hdev, chunk, &queue_type, &is_kernel_allocated_cb); - if (rc) + if (rc) { + hpriv->ctx->cs_counters.parsing_drop_cnt++; goto free_cs_object; + } if (is_kernel_allocated_cb) { cb = get_cb_from_cs_chunk(hdev, &hpriv->cb_mgr, chunk); if (!cb) { + hpriv->ctx->cs_counters.parsing_drop_cnt++; rc = -EINVAL; goto free_cs_object; } @@ -651,6 +668,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, job = hl_cs_allocate_job(hdev, queue_type, is_kernel_allocated_cb); if (!job) { + hpriv->ctx->cs_counters.out_of_mem_drop_cnt++; dev_err(hdev->dev, "Failed to allocate a new job\n"); rc = -ENOMEM; if (is_kernel_allocated_cb) @@ -683,6 +701,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, rc = cs_parser(hpriv, job); if (rc) { + hpriv->ctx->cs_counters.parsing_drop_cnt++; dev_err(hdev->dev, "Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n", cs->ctx->asid, cs->sequence, job->id, rc); @@ -691,6 +710,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, } if (int_queues_only) { + hpriv->ctx->cs_counters.parsing_drop_cnt++; dev_err(hdev->dev, "Reject CS %d.%llu because only internal queues jobs are present\n", cs->ctx->asid, cs->sequence); @@ -875,6 +895,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, job = hl_cs_allocate_job(hdev, q_type, true); if (!job) { + ctx->cs_counters.out_of_mem_drop_cnt++; dev_err(hdev->dev, "Failed to allocate a new job\n"); rc = -ENOMEM; goto put_cs; @@ -882,6 +903,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, cb = hl_cb_kernel_create(hdev, PAGE_SIZE); if (!cb) { + ctx->cs_counters.out_of_mem_drop_cnt++; kfree(job); rc = -EFAULT; goto put_cs; diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index e4d6f7c91194..ae781453a509 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -10,6 +10,7 @@ #include "include/armcp_if.h" #include "include/qman_if.h" +#include #include #include @@ -787,6 +788,7 @@ struct hl_ctx { struct mutex mem_hash_lock; struct mutex mmu_lock; struct list_head debugfs_list; + struct hl_cs_counters cs_counters; u64 cs_sequence; u64 *dram_default_hops; spinlock_t cs_lock; @@ -1391,6 +1393,7 @@ struct hl_device_idle_busy_ts { * @compute_ctx: current compute context executing. * @idle_busy_ts_arr: array to hold time stamps of transitions from idle to busy * and vice-versa + * @aggregated_cs_counters: aggregated cs counters among all contexts * @dram_used_mem: current DRAM memory consumption. * @timeout_jiffies: device CS timeout value. * @max_power: the max power of the device, as configured by the sysadmin. This @@ -1489,6 +1492,8 @@ struct hl_device { struct hl_device_idle_busy_ts *idle_busy_ts_arr; + struct hl_cs_counters aggregated_cs_counters; + atomic64_t dram_used_mem; u64 timeout_jiffies; u64 max_power; diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c index 52eedd3a6c3a..5af1c03da473 100644 --- a/drivers/misc/habanalabs/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/habanalabs_ioctl.c @@ -276,6 +276,27 @@ static int time_sync_info(struct hl_device *hdev, struct hl_info_args *args) min((size_t) max_size, sizeof(time_sync))) ? -EFAULT : 0; } +static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + struct hl_info_cs_counters cs_counters = {0}; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + memcpy(&cs_counters.cs_counters, &hdev->aggregated_cs_counters, + sizeof(struct hl_cs_counters)); + + if (hpriv->ctx) + memcpy(&cs_counters.ctx_cs_counters, &hpriv->ctx->cs_counters, + sizeof(struct hl_cs_counters)); + + return copy_to_user(out, &cs_counters, + min((size_t) max_size, sizeof(cs_counters))) ? -EFAULT : 0; +} + static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, struct device *dev) { @@ -336,6 +357,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_TIME_SYNC: return time_sync_info(hdev, args); + case HL_INFO_CS_COUNTERS: + return cs_counters_info(hpriv, args); + default: dev_err(dev, "Invalid request %d\n", args->op); rc = -ENOTTY; diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c index f5a10a5ac300..da66ffb528f8 100644 --- a/drivers/misc/habanalabs/hw_queue.c +++ b/drivers/misc/habanalabs/hw_queue.c @@ -514,6 +514,7 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) hdev->asic_funcs->hw_queues_lock(hdev); if (hl_device_disabled_or_in_reset(hdev)) { + ctx->cs_counters.device_in_reset_drop_cnt++; dev_err(hdev->dev, "device is disabled or in reset, CS rejected!\n"); rc = -EPERM; @@ -543,8 +544,10 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) break; } - if (rc) + if (rc) { + ctx->cs_counters.queue_full_drop_cnt++; goto unroll_cq_resv; + } if (q->queue_type == QUEUE_TYPE_EXT || q->queue_type == QUEUE_TYPE_HW) diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index f218d1c62c62..d5c4f983b7a8 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -263,6 +263,7 @@ enum hl_device_status { * time the driver was loaded. * HL_INFO_TIME_SYNC - Retrieve the device's time alongside the host's time * for synchronization. + * HL_INFO_CS_COUNTERS - Retrieve command submission counters */ #define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_EVENTS 1 @@ -274,6 +275,7 @@ enum hl_device_status { #define HL_INFO_CLK_RATE 8 #define HL_INFO_RESET_COUNT 9 #define HL_INFO_TIME_SYNC 10 +#define HL_INFO_CS_COUNTERS 11 #define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_CARD_NAME_MAX_LEN 16 @@ -338,6 +340,25 @@ struct hl_info_time_sync { __u64 host_time; }; +/** + * struct hl_info_cs_counters - command submission counters + * @out_of_mem_drop_cnt: dropped due to memory allocation issue + * @parsing_drop_cnt: dropped due to error in packet parsing + * @queue_full_drop_cnt: dropped due to queue full + * @device_in_reset_drop_cnt: dropped due to device in reset + */ +struct hl_cs_counters { + __u64 out_of_mem_drop_cnt; + __u64 parsing_drop_cnt; + __u64 queue_full_drop_cnt; + __u64 device_in_reset_drop_cnt; +}; + +struct hl_info_cs_counters { + struct hl_cs_counters cs_counters; + struct hl_cs_counters ctx_cs_counters; +}; + struct hl_info_args { /* Location of relevant struct in userspace */ __u64 return_pointer; From fcc6a4e606787be775b032f96c57472592f76300 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sun, 17 May 2020 08:20:35 +0300 Subject: [PATCH 13/28] habanalabs: Extract ECC information from FW ECC (Error Correcting Code) interrupts are going to be handled by the FW. Hence, we define an interface in which the driver can obtain the relevant ECC information. This information is needed for monitoring and can also lead to a hard reset if ECC error is not correctable. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 366 ++++++------------ drivers/misc/habanalabs/include/armcp_if.h | 12 +- .../include/gaudi/asic_reg/gaudi_regs.h | 19 +- 3 files changed, 146 insertions(+), 251 deletions(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index aa4139626a04..888f42adee6a 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -316,6 +316,13 @@ static enum hl_queue_type gaudi_queue_type[GAUDI_QUEUE_ID_SIZE] = { QUEUE_TYPE_NA, /* GAUDI_QUEUE_ID_NIC_9_3 */ }; +struct ecc_info_extract_params { + u64 block_address; + u32 num_memories; + bool derr; + bool disable_clock_gating; +}; + static int gaudi_mmu_update_asid_hop0_addr(struct hl_device *hdev, u32 asid, u64 phys_addr); static int gaudi_send_job_on_qman0(struct hl_device *hdev, @@ -5117,62 +5124,75 @@ static void gaudi_print_mmu_error_info(struct hl_device *hdev) * | |0xF4C memory wrappers 127:96 | * +-------------------+------------------------------------------------------+ */ -static void gaudi_print_ecc_info_generic(struct hl_device *hdev, - const char *block_name, - u64 block_address, int num_memories, - bool derr, bool disable_clock_gating) +static int gaudi_extract_ecc_info(struct hl_device *hdev, + struct ecc_info_extract_params *params, u64 *ecc_address, + u64 *ecc_syndrom, u8 *memory_wrapper_idx) { struct gaudi_device *gaudi = hdev->asic_specific; - int num_mem_regs = num_memories / 32 + ((num_memories % 32) ? 1 : 0); + u32 i, num_mem_regs, reg, err_bit; + u64 err_addr, err_word = 0; + int rc = 0; - if (block_address >= CFG_BASE) - block_address -= CFG_BASE; + num_mem_regs = params->num_memories / 32 + + ((params->num_memories % 32) ? 1 : 0); - if (derr) - block_address += GAUDI_ECC_DERR0_OFFSET; + if (params->block_address >= CFG_BASE) + params->block_address -= CFG_BASE; + + if (params->derr) + err_addr = params->block_address + GAUDI_ECC_DERR0_OFFSET; else - block_address += GAUDI_ECC_SERR0_OFFSET; + err_addr = params->block_address + GAUDI_ECC_SERR0_OFFSET; - if (disable_clock_gating) { + if (params->disable_clock_gating) { mutex_lock(&gaudi->clk_gate_mutex); hdev->asic_funcs->disable_clock_gating(hdev); } - switch (num_mem_regs) { - case 1: - dev_err(hdev->dev, - "%s ECC indication: 0x%08x\n", - block_name, RREG32(block_address)); - break; - case 2: - dev_err(hdev->dev, - "%s ECC indication: 0x%08x 0x%08x\n", - block_name, - RREG32(block_address), RREG32(block_address + 4)); - break; - case 3: - dev_err(hdev->dev, - "%s ECC indication: 0x%08x 0x%08x 0x%08x\n", - block_name, - RREG32(block_address), RREG32(block_address + 4), - RREG32(block_address + 8)); - break; - case 4: - dev_err(hdev->dev, - "%s ECC indication: 0x%08x 0x%08x 0x%08x 0x%08x\n", - block_name, - RREG32(block_address), RREG32(block_address + 4), - RREG32(block_address + 8), RREG32(block_address + 0xc)); - break; - default: - break; + /* Set invalid wrapper index */ + *memory_wrapper_idx = 0xFF; + /* Iterate through memory wrappers, a single bit must be set */ + for (i = 0 ; i > num_mem_regs ; i++) { + err_addr += i * 4; + err_word = RREG32(err_addr); + if (err_word) { + err_bit = __ffs(err_word); + *memory_wrapper_idx = err_bit + (32 * i); + break; + } } - if (disable_clock_gating) { + if (*memory_wrapper_idx == 0xFF) { + dev_err(hdev->dev, "ECC error information cannot be found\n"); + rc = -EINVAL; + goto enable_clk_gate; + } + + WREG32(params->block_address + GAUDI_ECC_MEM_SEL_OFFSET, + *memory_wrapper_idx); + + *ecc_address = + RREG32(params->block_address + GAUDI_ECC_ADDRESS_OFFSET); + *ecc_syndrom = + RREG32(params->block_address + GAUDI_ECC_SYNDROME_OFFSET); + + /* Clear error indication */ + reg = RREG32(params->block_address + GAUDI_ECC_MEM_INFO_CLR_OFFSET); + if (params->derr) + reg |= FIELD_PREP(GAUDI_ECC_MEM_INFO_CLR_DERR_MASK, 1); + else + reg |= FIELD_PREP(GAUDI_ECC_MEM_INFO_CLR_SERR_MASK, 1); + + WREG32(params->block_address + GAUDI_ECC_MEM_INFO_CLR_OFFSET, reg); + +enable_clk_gate: + if (params->disable_clock_gating) { hdev->asic_funcs->enable_clock_gating(hdev); mutex_unlock(&gaudi->clk_gate_mutex); } + + return rc; } static void gaudi_handle_qman_err_generic(struct hl_device *hdev, @@ -5225,239 +5245,99 @@ static void gaudi_handle_qman_err_generic(struct hl_device *hdev, } } -static void gaudi_print_ecc_info(struct hl_device *hdev, u16 event_type) +static void gaudi_handle_ecc_event(struct hl_device *hdev, u16 event_type, + struct hl_eq_ecc_data *ecc_data) { - u64 block_address; - u8 index; - int num_memories; - char desc[32]; - bool derr; - bool disable_clock_gating; + struct ecc_info_extract_params params; + u64 ecc_address = 0, ecc_syndrom = 0; + u8 index, memory_wrapper_idx = 0; + bool extract_info_from_fw; + int rc; switch (event_type) { - case GAUDI_EVENT_PCIE_CORE_SERR: - snprintf(desc, ARRAY_SIZE(desc), "%s", "PCIE_CORE"); - block_address = mmPCIE_CORE_BASE; - num_memories = 51; - derr = false; - disable_clock_gating = false; - break; - case GAUDI_EVENT_PCIE_CORE_DERR: - snprintf(desc, ARRAY_SIZE(desc), "%s", "PCIE_CORE"); - block_address = mmPCIE_CORE_BASE; - num_memories = 51; - derr = true; - disable_clock_gating = false; - break; - case GAUDI_EVENT_PCIE_IF_SERR: - snprintf(desc, ARRAY_SIZE(desc), "%s", "PCIE_WRAP"); - block_address = mmPCIE_WRAP_BASE; - num_memories = 11; - derr = false; - disable_clock_gating = false; - break; - case GAUDI_EVENT_PCIE_IF_DERR: - snprintf(desc, ARRAY_SIZE(desc), "%s", "PCIE_WRAP"); - block_address = mmPCIE_WRAP_BASE; - num_memories = 11; - derr = true; - disable_clock_gating = false; - break; - case GAUDI_EVENT_PCIE_PHY_SERR: - snprintf(desc, ARRAY_SIZE(desc), "%s", "PCIE_PHY"); - block_address = mmPCIE_PHY_BASE; - num_memories = 4; - derr = false; - disable_clock_gating = false; - break; - case GAUDI_EVENT_PCIE_PHY_DERR: - snprintf(desc, ARRAY_SIZE(desc), "%s", "PCIE_PHY"); - block_address = mmPCIE_PHY_BASE; - num_memories = 4; - derr = true; - disable_clock_gating = false; + case GAUDI_EVENT_PCIE_CORE_SERR ... GAUDI_EVENT_PCIE_PHY_DERR: + case GAUDI_EVENT_DMA0_SERR_ECC ... GAUDI_EVENT_MMU_DERR: + extract_info_from_fw = true; break; case GAUDI_EVENT_TPC0_SERR ... GAUDI_EVENT_TPC7_SERR: index = event_type - GAUDI_EVENT_TPC0_SERR; - block_address = mmTPC0_CFG_BASE + index * TPC_CFG_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "%s%d", "TPC", index); - num_memories = 90; - derr = false; - disable_clock_gating = true; + params.block_address = mmTPC0_CFG_BASE + index * TPC_CFG_OFFSET; + params.num_memories = 90; + params.derr = false; + params.disable_clock_gating = true; + extract_info_from_fw = false; break; case GAUDI_EVENT_TPC0_DERR ... GAUDI_EVENT_TPC7_DERR: index = event_type - GAUDI_EVENT_TPC0_DERR; - block_address = + params.block_address = mmTPC0_CFG_BASE + index * TPC_CFG_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "%s%d", "TPC", index); - num_memories = 90; - derr = true; - disable_clock_gating = true; + params.num_memories = 90; + params.derr = true; + params.disable_clock_gating = true; + extract_info_from_fw = false; break; case GAUDI_EVENT_MME0_ACC_SERR: case GAUDI_EVENT_MME1_ACC_SERR: case GAUDI_EVENT_MME2_ACC_SERR: case GAUDI_EVENT_MME3_ACC_SERR: index = (event_type - GAUDI_EVENT_MME0_ACC_SERR) / 4; - block_address = mmMME0_ACC_BASE + index * MME_ACC_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "MME%d_ACC", index); - num_memories = 128; - derr = false; - disable_clock_gating = true; + params.block_address = mmMME0_ACC_BASE + index * MME_ACC_OFFSET; + params.num_memories = 128; + params.derr = false; + params.disable_clock_gating = true; + extract_info_from_fw = false; break; case GAUDI_EVENT_MME0_ACC_DERR: case GAUDI_EVENT_MME1_ACC_DERR: case GAUDI_EVENT_MME2_ACC_DERR: case GAUDI_EVENT_MME3_ACC_DERR: index = (event_type - GAUDI_EVENT_MME0_ACC_DERR) / 4; - block_address = mmMME0_ACC_BASE + index * MME_ACC_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "MME%d_ACC", index); - num_memories = 128; - derr = true; - disable_clock_gating = true; + params.block_address = mmMME0_ACC_BASE + index * MME_ACC_OFFSET; + params.num_memories = 128; + params.derr = true; + params.disable_clock_gating = true; + extract_info_from_fw = false; break; case GAUDI_EVENT_MME0_SBAB_SERR: case GAUDI_EVENT_MME1_SBAB_SERR: case GAUDI_EVENT_MME2_SBAB_SERR: case GAUDI_EVENT_MME3_SBAB_SERR: index = (event_type - GAUDI_EVENT_MME0_SBAB_SERR) / 4; - block_address = mmMME0_SBAB_BASE + index * MME_ACC_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "MME%d_SBAB", index); - num_memories = 33; - derr = false; - disable_clock_gating = true; + params.block_address = + mmMME0_SBAB_BASE + index * MME_ACC_OFFSET; + params.num_memories = 33; + params.derr = false; + params.disable_clock_gating = true; + extract_info_from_fw = false; break; case GAUDI_EVENT_MME0_SBAB_DERR: case GAUDI_EVENT_MME1_SBAB_DERR: case GAUDI_EVENT_MME2_SBAB_DERR: case GAUDI_EVENT_MME3_SBAB_DERR: index = (event_type - GAUDI_EVENT_MME0_SBAB_DERR) / 4; - block_address = mmMME0_SBAB_BASE + index * MME_ACC_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "MME%d_SBAB", index); - num_memories = 33; - derr = true; - disable_clock_gating = true; - break; - case GAUDI_EVENT_DMA0_SERR_ECC ... GAUDI_EVENT_DMA7_SERR_ECC: - index = event_type - GAUDI_EVENT_DMA0_SERR_ECC; - block_address = mmDMA0_CORE_BASE + index * DMA_CORE_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "DMA%d_CORE", index); - num_memories = 16; - derr = false; - disable_clock_gating = false; - break; - case GAUDI_EVENT_DMA0_DERR_ECC ... GAUDI_EVENT_DMA7_DERR_ECC: - index = event_type - GAUDI_EVENT_DMA0_DERR_ECC; - block_address = mmDMA0_CORE_BASE + index * DMA_CORE_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "DMA%d_CORE", index); - num_memories = 16; - derr = true; - disable_clock_gating = false; - break; - case GAUDI_EVENT_CPU_IF_ECC_SERR: - block_address = mmCPU_IF_BASE; - snprintf(desc, ARRAY_SIZE(desc), "%s", "CPU"); - num_memories = 4; - derr = false; - disable_clock_gating = false; - break; - case GAUDI_EVENT_CPU_IF_ECC_DERR: - block_address = mmCPU_IF_BASE; - snprintf(desc, ARRAY_SIZE(desc), "%s", "CPU"); - num_memories = 4; - derr = true; - disable_clock_gating = false; - break; - case GAUDI_EVENT_PSOC_MEM_SERR: - block_address = mmPSOC_GLOBAL_CONF_BASE; - snprintf(desc, ARRAY_SIZE(desc), "%s", "CPU"); - num_memories = 4; - derr = false; - disable_clock_gating = false; - break; - case GAUDI_EVENT_PSOC_MEM_DERR: - block_address = mmPSOC_GLOBAL_CONF_BASE; - snprintf(desc, ARRAY_SIZE(desc), "%s", "CPU"); - num_memories = 4; - derr = true; - disable_clock_gating = false; - break; - case GAUDI_EVENT_PSOC_CORESIGHT_SERR: - block_address = mmPSOC_CS_TRACE_BASE; - snprintf(desc, ARRAY_SIZE(desc), "%s", "CPU"); - num_memories = 2; - derr = false; - disable_clock_gating = false; - break; - case GAUDI_EVENT_PSOC_CORESIGHT_DERR: - block_address = mmPSOC_CS_TRACE_BASE; - snprintf(desc, ARRAY_SIZE(desc), "%s", "CPU"); - num_memories = 2; - derr = true; - disable_clock_gating = false; - break; - case GAUDI_EVENT_SRAM0_SERR ... GAUDI_EVENT_SRAM28_SERR: - index = event_type - GAUDI_EVENT_SRAM0_SERR; - block_address = - mmSRAM_Y0_X0_BANK_BASE + index * SRAM_BANK_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "SRAM%d", index); - num_memories = 2; - derr = false; - disable_clock_gating = false; - break; - case GAUDI_EVENT_SRAM0_DERR ... GAUDI_EVENT_SRAM28_DERR: - index = event_type - GAUDI_EVENT_SRAM0_DERR; - block_address = - mmSRAM_Y0_X0_BANK_BASE + index * SRAM_BANK_OFFSET; - snprintf(desc, ARRAY_SIZE(desc), "SRAM%d", index); - num_memories = 2; - derr = true; - disable_clock_gating = false; - break; - case GAUDI_EVENT_DMA_IF0_SERR ... GAUDI_EVENT_DMA_IF3_SERR: - index = event_type - GAUDI_EVENT_DMA_IF0_SERR; - block_address = mmDMA_IF_W_S_BASE + - index * (mmDMA_IF_E_S_BASE - mmDMA_IF_W_S_BASE); - snprintf(desc, ARRAY_SIZE(desc), "DMA_IF%d", index); - num_memories = 60; - derr = false; - disable_clock_gating = false; - break; - case GAUDI_EVENT_DMA_IF0_DERR ... GAUDI_EVENT_DMA_IF3_DERR: - index = event_type - GAUDI_EVENT_DMA_IF0_DERR; - block_address = mmDMA_IF_W_S_BASE + - index * (mmDMA_IF_E_S_BASE - mmDMA_IF_W_S_BASE); - snprintf(desc, ARRAY_SIZE(desc), "DMA_IF%d", index); - derr = true; - num_memories = 60; - disable_clock_gating = false; - break; - case GAUDI_EVENT_HBM_0_SERR ... GAUDI_EVENT_HBM_3_SERR: - index = event_type - GAUDI_EVENT_HBM_0_SERR; - /* HBM Registers are at different offsets */ - block_address = mmHBM0_BASE + 0x8000 + - index * (mmHBM1_BASE - mmHBM0_BASE); - snprintf(desc, ARRAY_SIZE(desc), "HBM%d", index); - derr = false; - num_memories = 64; - disable_clock_gating = false; - break; - case GAUDI_EVENT_HBM_0_DERR ... GAUDI_EVENT_HBM_3_DERR: - index = event_type - GAUDI_EVENT_HBM_0_SERR; - /* HBM Registers are at different offsets */ - block_address = mmHBM0_BASE + 0x8000 + - index * (mmHBM1_BASE - mmHBM0_BASE); - snprintf(desc, ARRAY_SIZE(desc), "HBM%d", index); - derr = true; - num_memories = 64; - disable_clock_gating = false; - break; + params.block_address = + mmMME0_SBAB_BASE + index * MME_ACC_OFFSET; + params.num_memories = 33; + params.derr = true; + params.disable_clock_gating = true; default: return; } - gaudi_print_ecc_info_generic(hdev, desc, block_address, num_memories, - derr, disable_clock_gating); + if (extract_info_from_fw) { + ecc_address = le64_to_cpu(ecc_data->ecc_address); + ecc_syndrom = le64_to_cpu(ecc_data->ecc_syndrom); + memory_wrapper_idx = ecc_data->memory_wrapper_idx; + } else { + rc = gaudi_extract_ecc_info(hdev, ¶ms, &ecc_address, + &ecc_syndrom, &memory_wrapper_idx); + if (rc) + return; + } + + dev_err(hdev->dev, + "ECC error detected. address: %#llx. Syndrom: %#llx. block id %u\n", + ecc_address, ecc_syndrom, memory_wrapper_idx); } static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type) @@ -5507,8 +5387,6 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type, dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n", event_type, desc); - gaudi_print_ecc_info(hdev, event_type); - if (razwi) { gaudi_print_razwi_info(hdev); gaudi_print_mmu_error_info(hdev); @@ -5738,10 +5616,15 @@ static void gaudi_handle_eqe(struct hl_device *hdev, case GAUDI_EVENT_PSOC_CORESIGHT_DERR: case GAUDI_EVENT_SRAM0_DERR ... GAUDI_EVENT_SRAM28_DERR: case GAUDI_EVENT_DMA_IF0_DERR ... GAUDI_EVENT_DMA_IF3_DERR: - fallthrough; - case GAUDI_EVENT_GIC500: case GAUDI_EVENT_HBM_0_DERR ... GAUDI_EVENT_HBM_3_DERR: case GAUDI_EVENT_MMU_DERR: + gaudi_print_irq_info(hdev, event_type, true); + gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data); + if (hdev->hard_reset_on_fw_events) + hl_device_reset(hdev, true, false); + break; + + case GAUDI_EVENT_GIC500: case GAUDI_EVENT_AXI_ECC: case GAUDI_EVENT_L2_RAM_ECC: case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17: @@ -5837,6 +5720,11 @@ static void gaudi_handle_eqe(struct hl_device *hdev, case GAUDI_EVENT_HBM_0_SERR ... GAUDI_EVENT_HBM_3_SERR: fallthrough; case GAUDI_EVENT_MMU_SERR: + gaudi_print_irq_info(hdev, event_type, true); + gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data); + hl_fw_unmask_irq(hdev, event_type); + break; + case GAUDI_EVENT_PCIE_DEC: case GAUDI_EVENT_MME0_WBC_RSP: case GAUDI_EVENT_MME0_SBAB0_RSP: diff --git a/drivers/misc/habanalabs/include/armcp_if.h b/drivers/misc/habanalabs/include/armcp_if.h index dea7c90faafa..07f9972db28d 100644 --- a/drivers/misc/habanalabs/include/armcp_if.h +++ b/drivers/misc/habanalabs/include/armcp_if.h @@ -19,9 +19,19 @@ struct hl_eq_header { __le32 ctl; }; +struct hl_eq_ecc_data { + __le64 ecc_address; + __le64 ecc_syndrom; + __u8 memory_wrapper_idx; + __u8 pad[7]; +}; + struct hl_eq_entry { struct hl_eq_header hdr; - __le64 data[7]; + union { + struct hl_eq_ecc_data ecc_data; + __le64 data[7]; + }; }; #define HL_EQ_ENTRY_SIZE sizeof(struct hl_eq_entry) diff --git a/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h b/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h index 62078077aee5..0c75d43532bd 100644 --- a/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h +++ b/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h @@ -93,17 +93,14 @@ #include "psoc_hbm_pll_regs.h" #include "psoc_cpu_pll_regs.h" -#define GAUDI_ECC_MEM_SEL_OFFSET 0xF18 -#define GAUDI_ECC_ADDRESS_OFFSET 0xF1C -#define GAUDI_ECC_SYNDROME_OFFSET 0xF20 -#define GAUDI_ECC_SERR0_OFFSET 0xF30 -#define GAUDI_ECC_SERR1_OFFSET 0xF34 -#define GAUDI_ECC_SERR2_OFFSET 0xF38 -#define GAUDI_ECC_SERR3_OFFSET 0xF3C -#define GAUDI_ECC_DERR0_OFFSET 0xF40 -#define GAUDI_ECC_DERR1_OFFSET 0xF44 -#define GAUDI_ECC_DERR2_OFFSET 0xF48 -#define GAUDI_ECC_DERR3_OFFSET 0xF4C +#define GAUDI_ECC_MEM_SEL_OFFSET 0xF18 +#define GAUDI_ECC_ADDRESS_OFFSET 0xF1C +#define GAUDI_ECC_SYNDROME_OFFSET 0xF20 +#define GAUDI_ECC_MEM_INFO_CLR_OFFSET 0xF28 +#define GAUDI_ECC_MEM_INFO_CLR_SERR_MASK BIT(8) +#define GAUDI_ECC_MEM_INFO_CLR_DERR_MASK BIT(9) +#define GAUDI_ECC_SERR0_OFFSET 0xF30 +#define GAUDI_ECC_DERR0_OFFSET 0xF40 #define mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 0x492000 #define mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0 0x494000 From f4cbfd2445ffa3415023899e43ac9c8334171cf4 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Mon, 15 Jun 2020 17:45:12 +0300 Subject: [PATCH 14/28] habanalabs: PCIe iATU refactoring Divide iATU initialization into inbound/outbound methods. We must separate it in order to enable different match mode per PCIe region. In addition, added support for PCI address match mode. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 61 +++++++----- drivers/misc/habanalabs/goya/goya.c | 37 ++++++- drivers/misc/habanalabs/habanalabs.h | 52 +++++++++- drivers/misc/habanalabs/pci.c | 136 ++++++++++++-------------- 4 files changed, 181 insertions(+), 105 deletions(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 888f42adee6a..a6e40dec3cd2 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -465,6 +465,7 @@ static int gaudi_pci_bars_map(struct hl_device *hdev) static u64 gaudi_set_hbm_bar_base(struct hl_device *hdev, u64 addr) { struct gaudi_device *gaudi = hdev->asic_specific; + struct hl_inbound_pci_region pci_region; u64 old_addr = addr; int rc; @@ -472,7 +473,10 @@ static u64 gaudi_set_hbm_bar_base(struct hl_device *hdev, u64 addr) return old_addr; /* Inbound Region 2 - Bar 4 - Point to HBM */ - rc = hl_pci_set_dram_bar_base(hdev, 2, 4, addr); + pci_region.mode = PCI_BAR_MATCH_MODE; + pci_region.bar = HBM_BAR_ID; + pci_region.addr = addr; + rc = hl_pci_set_inbound_region(hdev, 2, &pci_region); if (rc) return U64_MAX; @@ -486,22 +490,43 @@ static u64 gaudi_set_hbm_bar_base(struct hl_device *hdev, u64 addr) static int gaudi_init_iatu(struct hl_device *hdev) { - int rc = 0; + struct hl_inbound_pci_region inbound_region; + struct hl_outbound_pci_region outbound_region; + int rc; + + /* Inbound Region 0 - Bar 0 - Point to SRAM + CFG */ + inbound_region.mode = PCI_BAR_MATCH_MODE; + inbound_region.bar = SRAM_BAR_ID; + inbound_region.addr = SRAM_BASE_ADDR; + rc = hl_pci_set_inbound_region(hdev, 0, &inbound_region); + if (rc) + goto done; /* Inbound Region 1 - Bar 2 - Point to SPI FLASH */ - rc = hl_pci_iatu_write(hdev, 0x314, - lower_32_bits(SPI_FLASH_BASE_ADDR)); - rc |= hl_pci_iatu_write(hdev, 0x318, - upper_32_bits(SPI_FLASH_BASE_ADDR)); - rc |= hl_pci_iatu_write(hdev, 0x300, 0); - /* Enable + Bar match + match enable */ - rc |= hl_pci_iatu_write(hdev, 0x304, 0xC0080200); - + inbound_region.mode = PCI_BAR_MATCH_MODE; + inbound_region.bar = CFG_BAR_ID; + inbound_region.addr = SPI_FLASH_BASE_ADDR; + rc = hl_pci_set_inbound_region(hdev, 1, &inbound_region); if (rc) - return -EIO; + goto done; - return hl_pci_init_iatu(hdev, SRAM_BASE_ADDR, DRAM_PHYS_BASE, - HOST_PHYS_BASE, HOST_PHYS_SIZE); + /* Inbound Region 2 - Bar 4 - Point to HBM */ + inbound_region.mode = PCI_BAR_MATCH_MODE; + inbound_region.bar = HBM_BAR_ID; + inbound_region.addr = DRAM_PHYS_BASE; + rc = hl_pci_set_inbound_region(hdev, 2, &inbound_region); + if (rc) + goto done; + + hdev->asic_funcs->set_dma_mask_from_fw(hdev); + + /* Outbound Region 0 - Point to Host */ + outbound_region.addr = HOST_PHYS_BASE; + outbound_region.size = HOST_PHYS_SIZE; + rc = hl_pci_set_outbound_region(hdev, &outbound_region); + +done: + return rc; } static int gaudi_early_init(struct hl_device *hdev) @@ -2884,16 +2909,6 @@ static int gaudi_hw_init(struct hl_device *hdev) gaudi_init_hbm_dma_qmans(hdev); - /* - * Before pushing u-boot/linux to device, need to set the hbm bar to - * base address of dram - */ - if (gaudi_set_hbm_bar_base(hdev, DRAM_PHYS_BASE) == U64_MAX) { - dev_err(hdev->dev, - "failed to map HBM bar to DRAM base address\n"); - return -EIO; - } - rc = gaudi_init_cpu(hdev); if (rc) { dev_err(hdev->dev, "failed to initialize CPU\n"); diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index ff32a8fa7624..5839b5bc9ee3 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -458,6 +458,7 @@ static int goya_pci_bars_map(struct hl_device *hdev) static u64 goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr) { struct goya_device *goya = hdev->asic_specific; + struct hl_inbound_pci_region pci_region; u64 old_addr = addr; int rc; @@ -465,7 +466,10 @@ static u64 goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr) return old_addr; /* Inbound Region 1 - Bar 4 - Point to DDR */ - rc = hl_pci_set_dram_bar_base(hdev, 1, 4, addr); + pci_region.mode = PCI_BAR_MATCH_MODE; + pci_region.bar = DDR_BAR_ID; + pci_region.addr = addr; + rc = hl_pci_set_inbound_region(hdev, 1, &pci_region); if (rc) return U64_MAX; @@ -487,8 +491,35 @@ static u64 goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr) */ static int goya_init_iatu(struct hl_device *hdev) { - return hl_pci_init_iatu(hdev, SRAM_BASE_ADDR, DRAM_PHYS_BASE, - HOST_PHYS_BASE, HOST_PHYS_SIZE); + struct hl_inbound_pci_region inbound_region; + struct hl_outbound_pci_region outbound_region; + int rc; + + /* Inbound Region 0 - Bar 0 - Point to SRAM and CFG */ + inbound_region.mode = PCI_BAR_MATCH_MODE; + inbound_region.bar = SRAM_CFG_BAR_ID; + inbound_region.addr = SRAM_BASE_ADDR; + rc = hl_pci_set_inbound_region(hdev, 0, &inbound_region); + if (rc) + goto done; + + /* Inbound Region 1 - Bar 4 - Point to DDR */ + inbound_region.mode = PCI_BAR_MATCH_MODE; + inbound_region.bar = DDR_BAR_ID; + inbound_region.addr = DRAM_PHYS_BASE; + rc = hl_pci_set_inbound_region(hdev, 1, &inbound_region); + if (rc) + goto done; + + hdev->asic_funcs->set_dma_mask_from_fw(hdev); + + /* Outbound Region 0 - Point to Host */ + outbound_region.addr = HOST_PHYS_BASE; + outbound_region.size = HOST_PHYS_SIZE; + rc = hl_pci_set_outbound_region(hdev, &outbound_region); + +done: + return rc; } /* diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index ae781453a509..365236589bbf 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -66,6 +66,8 @@ #define IS_POWER_OF_2(n) (n != 0 && ((n & (n - 1)) == 0)) #define IS_MAX_PENDING_CS_VALID(n) (IS_POWER_OF_2(n) && (n > 1)) +#define HL_PCI_NUM_BARS 6 + /** * struct pgt_info - MMU hop page info. * @node: hash linked-list node for the pgts shadow hash of pgts. @@ -90,6 +92,16 @@ struct pgt_info { struct hl_device; struct hl_fpriv; +/** + * enum hl_pci_match_mode - pci match mode per region + * @PCI_ADDRESS_MATCH_MODE: address match mode + * @PCI_BAR_MATCH_MODE: bar match mode + */ +enum hl_pci_match_mode { + PCI_ADDRESS_MATCH_MODE, + PCI_BAR_MATCH_MODE +}; + /** * enum hl_fw_component - F/W components to read version through registers. * @FW_COMP_UBOOT: u-boot. @@ -125,6 +137,32 @@ enum hl_cs_type { CS_TYPE_WAIT }; +/* + * struct hl_inbound_pci_region - inbound region descriptor + * @mode: pci match mode for this region + * @addr: region target address + * @size: region size in bytes + * @offset_in_bar: offset within bar (address match mode) + * @bar: bar id + */ +struct hl_inbound_pci_region { + enum hl_pci_match_mode mode; + u64 addr; + u64 size; + u64 offset_in_bar; + u8 bar; +}; + +/* + * struct hl_outbound_pci_region - outbound region descriptor + * @addr: region target address + * @size: region size in bytes + */ +struct hl_outbound_pci_region { + u64 addr; + u64 size; +}; + /* * struct hl_hw_sob - H/W SOB info. * @hdev: habanalabs device structure. @@ -1347,7 +1385,9 @@ struct hl_device_idle_busy_ts { /** * struct hl_device - habanalabs device structure. * @pdev: pointer to PCI device, can be NULL in case of simulator device. - * @pcie_bar: array of available PCIe bars. + * @pcie_bar_phys: array of available PCIe bars physical addresses. + * (required only for PCI address match mode) + * @pcie_bar: array of available PCIe bars virtual addresses. * @rmmio: configuration area address on SRAM. * @cdev: related char device. * @cdev_ctrl: char device for control operations only (INFO IOCTL) @@ -1442,7 +1482,8 @@ struct hl_device_idle_busy_ts { */ struct hl_device { struct pci_dev *pdev; - void __iomem *pcie_bar[6]; + u64 pcie_bar_phys[HL_PCI_NUM_BARS]; + void __iomem *pcie_bar[HL_PCI_NUM_BARS]; void __iomem *rmmio; struct cdev cdev; struct cdev cdev_ctrl; @@ -1767,9 +1808,10 @@ int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3], int hl_pci_iatu_write(struct hl_device *hdev, u32 addr, u32 data); int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar, u64 addr); -int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address, - u64 dram_base_address, u64 host_phys_base_address, - u64 host_phys_size); +int hl_pci_set_inbound_region(struct hl_device *hdev, u8 region, + struct hl_inbound_pci_region *pci_region); +int hl_pci_set_outbound_region(struct hl_device *hdev, + struct hl_outbound_pci_region *pci_region); int hl_pci_init(struct hl_device *hdev); void hl_pci_fini(struct hl_device *hdev); diff --git a/drivers/misc/habanalabs/pci.c b/drivers/misc/habanalabs/pci.c index 61a8bb07262c..1791f6623c69 100644 --- a/drivers/misc/habanalabs/pci.c +++ b/drivers/misc/habanalabs/pci.c @@ -9,9 +9,15 @@ #include "include/hw_ip/pci/pci_general.h" #include +#include #define HL_PLDM_PCI_ELBI_TIMEOUT_MSEC (HL_PCI_ELBI_TIMEOUT_MSEC * 10) +#define IATU_REGION_CTRL_REGION_EN_MASK BIT(31) +#define IATU_REGION_CTRL_MATCH_MODE_MASK BIT(30) +#define IATU_REGION_CTRL_NUM_MATCH_EN_MASK BIT(19) +#define IATU_REGION_CTRL_BAR_NUM_MASK GENMASK(10, 8) + /** * hl_pci_bars_map() - Map PCI BARs. * @hdev: Pointer to hl_device structure. @@ -187,110 +193,94 @@ static void hl_pci_reset_link_through_bridge(struct hl_device *hdev) } /** - * hl_pci_set_dram_bar_base() - Set DDR BAR to map specific device address. + * hl_pci_set_inbound_region() - Configure inbound region * @hdev: Pointer to hl_device structure. - * @inbound_region: Inbound region number. - * @bar: PCI BAR number. - * @addr: Address in DRAM. Must be aligned to DRAM bar size. + * @region: Inbound region number. + * @pci_region: Inbound region parameters. * - * Configure the iATU so that the DRAM bar will start at the specified address. + * Configure the iATU inbound region. * * Return: 0 on success, negative value for failure. */ -int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar, - u64 addr) +int hl_pci_set_inbound_region(struct hl_device *hdev, u8 region, + struct hl_inbound_pci_region *pci_region) { struct asic_fixed_properties *prop = &hdev->asic_prop; - u32 offset; - int rc; + u64 bar_phys_base, region_base, region_end_address; + u32 offset, ctrl_reg_val; + int rc = 0; - switch (inbound_region) { - case 0: - offset = 0x100; - break; - case 1: - offset = 0x300; - break; - case 2: - offset = 0x500; - break; - default: - dev_err(hdev->dev, "Invalid inbound region %d\n", - inbound_region); - return -EINVAL; - } + /* region offset */ + offset = (0x200 * region) + 0x100; - if (bar != 0 && bar != 2 && bar != 4) { - dev_err(hdev->dev, "Invalid PCI BAR %d\n", bar); - return -EINVAL; + if (pci_region->mode == PCI_ADDRESS_MATCH_MODE) { + bar_phys_base = hdev->pcie_bar_phys[pci_region->bar]; + region_base = bar_phys_base + pci_region->offset_in_bar; + region_end_address = region_base + pci_region->size - 1; + + rc |= hl_pci_iatu_write(hdev, offset + 0x8, + lower_32_bits(region_base)); + rc |= hl_pci_iatu_write(hdev, offset + 0xC, + upper_32_bits(region_base)); + rc |= hl_pci_iatu_write(hdev, offset + 0x10, + lower_32_bits(region_end_address)); } /* Point to the specified address */ - rc = hl_pci_iatu_write(hdev, offset + 0x14, lower_32_bits(addr)); - rc |= hl_pci_iatu_write(hdev, offset + 0x18, upper_32_bits(addr)); + rc = hl_pci_iatu_write(hdev, offset + 0x14, + lower_32_bits(pci_region->addr)); + rc |= hl_pci_iatu_write(hdev, offset + 0x18, + upper_32_bits(pci_region->addr)); rc |= hl_pci_iatu_write(hdev, offset + 0x0, 0); - /* Enable + BAR match + match enable + BAR number */ - rc |= hl_pci_iatu_write(hdev, offset + 0x4, 0xC0080000 | (bar << 8)); + + /* Enable + bar/address match + match enable + bar number */ + ctrl_reg_val = FIELD_PREP(IATU_REGION_CTRL_REGION_EN_MASK, 1); + ctrl_reg_val |= FIELD_PREP(IATU_REGION_CTRL_MATCH_MODE_MASK, + pci_region->mode); + ctrl_reg_val |= FIELD_PREP(IATU_REGION_CTRL_NUM_MATCH_EN_MASK, 1); + + if (pci_region->mode == PCI_BAR_MATCH_MODE) + ctrl_reg_val |= FIELD_PREP(IATU_REGION_CTRL_BAR_NUM_MASK, + pci_region->bar); + + rc |= hl_pci_iatu_write(hdev, offset + 0x4, ctrl_reg_val); /* Return the DBI window to the default location */ rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr, 0); rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr + 4, 0); if (rc) - dev_err(hdev->dev, "failed to map DRAM bar to 0x%08llx\n", - addr); + dev_err(hdev->dev, "failed to map bar %u to 0x%08llx\n", + pci_region->bar, pci_region->addr); return rc; } /** - * hl_pci_init_iatu() - Initialize the iATU unit inside the PCI controller. + * hl_pci_set_outbound_region() - Configure outbound region 0 * @hdev: Pointer to hl_device structure. - * @sram_base_address: SRAM base address. - * @dram_base_address: DRAM base address. - * @host_phys_base_address: Base physical address of host memory for device - * transactions. - * @host_phys_size: Size of host memory for device transactions. + * @pci_region: Outbound region parameters. * - * This is needed in case the firmware doesn't initialize the iATU. + * Configure the iATU outbound region 0. * * Return: 0 on success, negative value for failure. */ -int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address, - u64 dram_base_address, u64 host_phys_base_address, - u64 host_phys_size) +int hl_pci_set_outbound_region(struct hl_device *hdev, + struct hl_outbound_pci_region *pci_region) { struct asic_fixed_properties *prop = &hdev->asic_prop; - u64 host_phys_end_addr; + u64 outbound_region_end_address; int rc = 0; - /* Inbound Region 0 - Bar 0 - Point to SRAM base address */ - rc = hl_pci_iatu_write(hdev, 0x114, lower_32_bits(sram_base_address)); - rc |= hl_pci_iatu_write(hdev, 0x118, upper_32_bits(sram_base_address)); - rc |= hl_pci_iatu_write(hdev, 0x100, 0); - /* Enable + Bar match + match enable */ - rc |= hl_pci_iatu_write(hdev, 0x104, 0xC0080000); - - /* Return the DBI window to the default location */ - rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr, 0); - rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr + 4, 0); - - hdev->asic_funcs->set_dma_mask_from_fw(hdev); - - /* Point to DRAM */ - if (!hdev->asic_funcs->set_dram_bar_base) - return -EINVAL; - if (hdev->asic_funcs->set_dram_bar_base(hdev, dram_base_address) == - U64_MAX) - return -EIO; - - /* Outbound Region 0 - Point to Host */ - host_phys_end_addr = host_phys_base_address + host_phys_size - 1; + /* Outbound Region 0 */ + outbound_region_end_address = + pci_region->addr + pci_region->size - 1; rc |= hl_pci_iatu_write(hdev, 0x008, - lower_32_bits(host_phys_base_address)); + lower_32_bits(pci_region->addr)); rc |= hl_pci_iatu_write(hdev, 0x00C, - upper_32_bits(host_phys_base_address)); - rc |= hl_pci_iatu_write(hdev, 0x010, lower_32_bits(host_phys_end_addr)); + upper_32_bits(pci_region->addr)); + rc |= hl_pci_iatu_write(hdev, 0x010, + lower_32_bits(outbound_region_end_address)); rc |= hl_pci_iatu_write(hdev, 0x014, 0); if ((hdev->power9_64bit_dma_enable) && (hdev->dma_mask == 64)) @@ -298,7 +288,8 @@ int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address, else rc |= hl_pci_iatu_write(hdev, 0x018, 0); - rc |= hl_pci_iatu_write(hdev, 0x020, upper_32_bits(host_phys_end_addr)); + rc |= hl_pci_iatu_write(hdev, 0x020, + upper_32_bits(outbound_region_end_address)); /* Increase region size */ rc |= hl_pci_iatu_write(hdev, 0x000, 0x00002000); /* Enable */ @@ -308,10 +299,7 @@ int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address, rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr, 0); rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr + 4, 0); - if (rc) - return -EIO; - - return 0; + return rc; } /** From 12ae3133d2df4d5091cbf6a966a98f5e07c31b56 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Fri, 3 Jul 2020 20:58:23 +0300 Subject: [PATCH 15/28] habanalabs: remove soft-reset support from GAUDI Soft-reset isn't supported in GAUDI. Remove the code that performs it and print error in case the user wants to do it via sysfs. Signed-off-by: Oded Gabbay Reviewed-by: Tomer Tayar --- drivers/misc/habanalabs/gaudi/gaudi.c | 95 ++++++++++----------------- 1 file changed, 34 insertions(+), 61 deletions(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index a6e40dec3cd2..eede6c33a37f 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -75,7 +75,6 @@ #define GAUDI_PLDM_RESET_WAIT_MSEC 1000 /* 1s */ #define GAUDI_PLDM_HRESET_TIMEOUT_MSEC 20000 /* 20s */ -#define GAUDI_PLDM_SRESET_TIMEOUT_MSEC 14000 /* 14s */ #define GAUDI_PLDM_TEST_QUEUE_WAIT_USEC 1000000 /* 1s */ #define GAUDI_PLDM_MMU_TIMEOUT_USEC (MMU_CONFIG_TIMEOUT_USEC * 100) #define GAUDI_PLDM_QMAN0_TIMEOUT_USEC (HL_DEVICE_TIMEOUT_USEC * 30) @@ -2587,16 +2586,14 @@ static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset) cpu_timeout_ms = GAUDI_CPU_RESET_WAIT_MSEC; } - if (hard_reset) { - /* - * I don't know what is the state of the CPU so make sure it is - * stopped in any means necessary - */ - WREG32(mmPSOC_GLOBAL_CONF_KMD_MSG_TO_CPU, KMD_MSG_GOTO_WFE); - WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, - GAUDI_EVENT_HALT_MACHINE); - msleep(cpu_timeout_ms); - } + /* + * I don't know what is the state of the CPU so make sure it is + * stopped in any means necessary + */ + WREG32(mmPSOC_GLOBAL_CONF_KMD_MSG_TO_CPU, KMD_MSG_GOTO_WFE); + WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, + GAUDI_EVENT_HALT_MACHINE); + msleep(cpu_timeout_ms); gaudi_stop_mme_qmans(hdev); gaudi_stop_tpc_qmans(hdev); @@ -2621,10 +2618,7 @@ static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset) gaudi_disable_timestamp(hdev); - if (hard_reset) - gaudi_disable_msi(hdev); - else - gaudi_sync_irqs(hdev); + gaudi_disable_msi(hdev); } static int gaudi_mmu_init(struct hl_device *hdev) @@ -2969,46 +2963,37 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset) struct gaudi_device *gaudi = hdev->asic_specific; u32 status, reset_timeout_ms, boot_strap = 0; - if (hdev->pldm) { - if (hard_reset) - reset_timeout_ms = GAUDI_PLDM_HRESET_TIMEOUT_MSEC; - else - reset_timeout_ms = GAUDI_PLDM_SRESET_TIMEOUT_MSEC; - } else { + if (!hard_reset) { + dev_err(hdev->dev, "GAUDI doesn't support soft-reset\n"); + return; + } + + if (hdev->pldm) + reset_timeout_ms = GAUDI_PLDM_HRESET_TIMEOUT_MSEC; + else reset_timeout_ms = GAUDI_RESET_TIMEOUT_MSEC; - } - if (hard_reset) { - /* Tell ASIC not to re-initialize PCIe */ - WREG32(mmPREBOOT_PCIE_EN, LKD_HARD_RESET_MAGIC); + /* Tell ASIC not to re-initialize PCIe */ + WREG32(mmPREBOOT_PCIE_EN, LKD_HARD_RESET_MAGIC); - boot_strap = RREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS); - /* H/W bug WA: - * rdata[31:0] = strap_read_val; - * wdata[31:0] = rdata[30:21],1'b0,rdata[20:0] - */ - boot_strap = (((boot_strap & 0x7FE00000) << 1) | - (boot_strap & 0x001FFFFF)); - WREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS, boot_strap & ~0x2); + boot_strap = RREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS); - /* Restart BTL/BLR upon hard-reset */ - WREG32(mmPSOC_GLOBAL_CONF_BOOT_SEQ_RE_START, 1); + /* H/W bug WA: + * rdata[31:0] = strap_read_val; + * wdata[31:0] = rdata[30:21],1'b0,rdata[20:0] + */ + boot_strap = (((boot_strap & 0x7FE00000) << 1) | + (boot_strap & 0x001FFFFF)); + WREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS, boot_strap & ~0x2); - WREG32(mmPSOC_GLOBAL_CONF_SW_ALL_RST, - 1 << PSOC_GLOBAL_CONF_SW_ALL_RST_IND_SHIFT); - dev_info(hdev->dev, - "Issued HARD reset command, going to wait %dms\n", - reset_timeout_ms); - } else { - /* Don't restart BTL/BLR upon soft-reset */ - WREG32(mmPSOC_GLOBAL_CONF_BOOT_SEQ_RE_START, 0); + /* Restart BTL/BLR upon hard-reset */ + WREG32(mmPSOC_GLOBAL_CONF_BOOT_SEQ_RE_START, 1); - WREG32(mmPSOC_GLOBAL_CONF_SOFT_RST, - 1 << PSOC_GLOBAL_CONF_SOFT_RST_IND_SHIFT); - dev_info(hdev->dev, - "Issued SOFT reset command, going to wait %dms\n", - reset_timeout_ms); - } + WREG32(mmPSOC_GLOBAL_CONF_SW_ALL_RST, + 1 << PSOC_GLOBAL_CONF_SW_ALL_RST_IND_SHIFT); + dev_info(hdev->dev, + "Issued HARD reset command, going to wait %dms\n", + reset_timeout_ms); /* * After hard reset, we can't poll the BTM_FSM register because the PSOC @@ -3022,18 +3007,6 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset) "Timeout while waiting for device to reset 0x%x\n", status); - if (!hard_reset) { - gaudi->hw_cap_initialized &= ~(HW_CAP_PCI_DMA | HW_CAP_MME | - HW_CAP_TPC_MASK | - HW_CAP_HBM_DMA); - - WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, - GAUDI_EVENT_SOFT_RESET); - return; - } - - /* We continue here only for hard-reset */ - WREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS, boot_strap); gaudi->hw_cap_initialized &= ~(HW_CAP_CPU | HW_CAP_CPU_Q | From 3abc99bb7dcbc0704972dae6c6ba92fbb1fbf191 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Tue, 23 Jun 2020 14:50:39 +0300 Subject: [PATCH 16/28] habanalabs: configure maximum queues per asic Currently the amount of maximum queues is statically configured. Using a static value is causing redundunt cycles when traversing all queues and consumes more memory than actually needed. In this patch we configure each asic with the exact number of queues needed. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/command_submission.c | 24 +++++++++++--- drivers/misc/habanalabs/gaudi/gaudi.c | 33 +++++++++++-------- drivers/misc/habanalabs/goya/goya.c | 34 +++++++++++++++----- drivers/misc/habanalabs/goya/goyaP.h | 6 +--- drivers/misc/habanalabs/habanalabs.h | 10 +++--- drivers/misc/habanalabs/hw_queue.c | 19 +++++++---- 6 files changed, 83 insertions(+), 43 deletions(-) diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c index 777f88d25acd..7769a1aacca1 100644 --- a/drivers/misc/habanalabs/command_submission.c +++ b/drivers/misc/habanalabs/command_submission.c @@ -363,6 +363,7 @@ static void cs_do_release(struct kref *ref) cs_counters_aggregate(hdev, cs->ctx); + kfree(cs->jobs_in_queue_cnt); kfree(cs); } @@ -435,13 +436,19 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, other = ctx->cs_pending[cs_cmpl->cs_seq & (hdev->asic_prop.max_pending_cs - 1)]; if ((other) && (!dma_fence_is_signaled(other))) { - spin_unlock(&ctx->cs_lock); dev_dbg(hdev->dev, "Rejecting CS because of too many in-flights CS\n"); rc = -EAGAIN; goto free_fence; } + cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues, + sizeof(*cs->jobs_in_queue_cnt), GFP_ATOMIC); + if (!cs->jobs_in_queue_cnt) { + rc = -ENOMEM; + goto free_fence; + } + dma_fence_init(&cs_cmpl->base_fence, &hl_fence_ops, &cs_cmpl->lock, ctx->asid, ctx->cs_sequence); @@ -463,6 +470,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, return 0; free_fence: + spin_unlock(&ctx->cs_lock); kfree(cs_cmpl); free_cs: kfree(cs); @@ -515,10 +523,18 @@ static int validate_queue_index(struct hl_device *hdev, struct asic_fixed_properties *asic = &hdev->asic_prop; struct hw_queue_properties *hw_queue_prop; + /* This must be checked here to prevent out-of-bounds access to + * hw_queues_props array + */ + if (chunk->queue_index >= asic->max_queues) { + dev_err(hdev->dev, "Queue index %d is invalid\n", + chunk->queue_index); + return -EINVAL; + } + hw_queue_prop = &asic->hw_queues_props[chunk->queue_index]; - if ((chunk->queue_index >= HL_MAX_QUEUES) || - (hw_queue_prop->type == QUEUE_TYPE_NA)) { + if (hw_queue_prop->type == QUEUE_TYPE_NA) { dev_err(hdev->dev, "Queue index %d is invalid\n", chunk->queue_index); return -EINVAL; @@ -795,7 +811,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx]; q_type = hw_queue_prop->type; - if ((q_idx >= HL_MAX_QUEUES) || + if ((q_idx >= hdev->asic_prop.max_queues) || (!hw_queue_prop->supports_sync_stream)) { dev_err(hdev->dev, "Queue index %d is invalid\n", q_idx); rc = -EINVAL; diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index eede6c33a37f..7eee4a10154b 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -340,14 +340,15 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev) struct asic_fixed_properties *prop = &hdev->asic_prop; int i; - if (GAUDI_QUEUE_ID_SIZE >= HL_MAX_QUEUES) { - dev_err(hdev->dev, - "Number of H/W queues must be smaller than %d\n", - HL_MAX_QUEUES); - return -EFAULT; - } + prop->max_queues = GAUDI_QUEUE_ID_SIZE; + prop->hw_queues_props = kcalloc(prop->max_queues, + sizeof(struct hw_queue_properties), + GFP_KERNEL); - for (i = 0 ; i < GAUDI_QUEUE_ID_SIZE ; i++) { + if (!prop->hw_queues_props) + return -ENOMEM; + + for (i = 0 ; i < prop->max_queues ; i++) { if (gaudi_queue_type[i] == QUEUE_TYPE_EXT) { prop->hw_queues_props[i].type = QUEUE_TYPE_EXT; prop->hw_queues_props[i].driver_only = 0; @@ -370,9 +371,6 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev) } } - for (; i < HL_MAX_QUEUES; i++) - prop->hw_queues_props[i].type = QUEUE_TYPE_NA; - prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES; prop->sync_stream_first_sob = 0; prop->sync_stream_first_mon = 0; @@ -548,7 +546,8 @@ static int gaudi_early_init(struct hl_device *hdev) (unsigned long long) pci_resource_len(pdev, SRAM_BAR_ID), SRAM_BAR_SIZE); - return -ENODEV; + rc = -ENODEV; + goto free_queue_props; } if (pci_resource_len(pdev, CFG_BAR_ID) != CFG_BAR_SIZE) { @@ -558,20 +557,26 @@ static int gaudi_early_init(struct hl_device *hdev) (unsigned long long) pci_resource_len(pdev, CFG_BAR_ID), CFG_BAR_SIZE); - return -ENODEV; + rc = -ENODEV; + goto free_queue_props; } prop->dram_pci_bar_size = pci_resource_len(pdev, HBM_BAR_ID); rc = hl_pci_init(hdev); if (rc) - return rc; + goto free_queue_props; return 0; + +free_queue_props: + kfree(hdev->asic_prop.hw_queues_props); + return rc; } static int gaudi_early_fini(struct hl_device *hdev) { + kfree(hdev->asic_prop.hw_queues_props); hl_pci_fini(hdev); return 0; @@ -3461,7 +3466,7 @@ static int gaudi_test_queues(struct hl_device *hdev) { int i, rc, ret_val = 0; - for (i = 0 ; i < HL_MAX_QUEUES ; i++) { + for (i = 0 ; i < hdev->asic_prop.max_queues ; i++) { if (hdev->asic_prop.hw_queues_props[i].type == QUEUE_TYPE_EXT) { rc = gaudi_test_queue(hdev, i); if (rc) diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 5839b5bc9ee3..36db771f391c 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -337,11 +337,19 @@ static int goya_mmu_set_dram_default_page(struct hl_device *hdev); static int goya_mmu_add_mappings_for_device_cpu(struct hl_device *hdev); static void goya_mmu_prepare(struct hl_device *hdev, u32 asid); -void goya_get_fixed_properties(struct hl_device *hdev) +int goya_get_fixed_properties(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; int i; + prop->max_queues = GOYA_QUEUE_ID_SIZE; + prop->hw_queues_props = kcalloc(prop->max_queues, + sizeof(struct hw_queue_properties), + GFP_KERNEL); + + if (!prop->hw_queues_props) + return -ENOMEM; + for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) { prop->hw_queues_props[i].type = QUEUE_TYPE_EXT; prop->hw_queues_props[i].driver_only = 0; @@ -361,9 +369,6 @@ void goya_get_fixed_properties(struct hl_device *hdev) prop->hw_queues_props[i].requires_kernel_cb = 0; } - for (; i < HL_MAX_QUEUES; i++) - prop->hw_queues_props[i].type = QUEUE_TYPE_NA; - prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES; prop->dram_base_address = DRAM_PHYS_BASE; @@ -428,6 +433,8 @@ void goya_get_fixed_properties(struct hl_device *hdev) CARD_NAME_MAX_LEN); prop->max_pending_cs = GOYA_MAX_PENDING_CS; + + return 0; } /* @@ -540,7 +547,11 @@ static int goya_early_init(struct hl_device *hdev) u32 val; int rc; - goya_get_fixed_properties(hdev); + rc = goya_get_fixed_properties(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to get fixed properties\n"); + return rc; + } /* Check BAR sizes */ if (pci_resource_len(pdev, SRAM_CFG_BAR_ID) != CFG_BAR_SIZE) { @@ -550,7 +561,8 @@ static int goya_early_init(struct hl_device *hdev) (unsigned long long) pci_resource_len(pdev, SRAM_CFG_BAR_ID), CFG_BAR_SIZE); - return -ENODEV; + rc = -ENODEV; + goto free_queue_props; } if (pci_resource_len(pdev, MSIX_BAR_ID) != MSIX_BAR_SIZE) { @@ -560,14 +572,15 @@ static int goya_early_init(struct hl_device *hdev) (unsigned long long) pci_resource_len(pdev, MSIX_BAR_ID), MSIX_BAR_SIZE); - return -ENODEV; + rc = -ENODEV; + goto free_queue_props; } prop->dram_pci_bar_size = pci_resource_len(pdev, DDR_BAR_ID); rc = hl_pci_init(hdev); if (rc) - return rc; + goto free_queue_props; if (!hdev->pldm) { val = RREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS); @@ -577,6 +590,10 @@ static int goya_early_init(struct hl_device *hdev) } return 0; + +free_queue_props: + kfree(hdev->asic_prop.hw_queues_props); + return rc; } /* @@ -589,6 +606,7 @@ static int goya_early_init(struct hl_device *hdev) */ static int goya_early_fini(struct hl_device *hdev) { + kfree(hdev->asic_prop.hw_queues_props); hl_pci_fini(hdev); return 0; diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h index 9d8a1761252d..8265cc21b45a 100644 --- a/drivers/misc/habanalabs/goya/goyaP.h +++ b/drivers/misc/habanalabs/goya/goyaP.h @@ -31,10 +31,6 @@ */ #define NUMBER_OF_INTERRUPTS (NUMBER_OF_CMPLT_QUEUES + 1) -#if (NUMBER_OF_HW_QUEUES >= HL_MAX_QUEUES) -#error "Number of H/W queues must be smaller than HL_MAX_QUEUES" -#endif - #if (NUMBER_OF_INTERRUPTS > GOYA_MSIX_ENTRIES) #error "Number of MSIX interrupts must be smaller or equal to GOYA_MSIX_ENTRIES" #endif @@ -170,7 +166,7 @@ struct goya_device { u8 device_cpu_mmu_mappings_done; }; -void goya_get_fixed_properties(struct hl_device *hdev); +int goya_get_fixed_properties(struct hl_device *hdev); int goya_mmu_init(struct hl_device *hdev); void goya_init_dma_qmans(struct hl_device *hdev); void goya_init_mme_qmans(struct hl_device *hdev); diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 365236589bbf..9213d107b533 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -41,8 +41,6 @@ #define HL_SIM_MAX_TIMEOUT_US 10000000 /* 10s */ -#define HL_MAX_QUEUES 128 - #define HL_IDLE_BUSY_TS_ARR_SIZE 4096 /* Memory */ @@ -290,14 +288,15 @@ struct hl_mmu_properties { * @high_pll: high PLL frequency used by the device. * @cb_pool_cb_cnt: number of CBs in the CB pool. * @cb_pool_cb_size: size of each CB in the CB pool. - * @tpc_enabled_mask: which TPCs are enabled. + * @max_pending_cs: maximum of concurrent pending command submissions + * @max_queues: maximum amount of queues in the system * @sync_stream_first_sob: first sync object available for sync stream use * @sync_stream_first_mon: first monitor available for sync stream use * @tpc_enabled_mask: which TPCs are enabled. * @completion_queues_count: number of completion queues. */ struct asic_fixed_properties { - struct hw_queue_properties hw_queues_props[HL_MAX_QUEUES]; + struct hw_queue_properties *hw_queues_props; struct armcp_info armcp_info; char uboot_ver[VERSION_MAX_LEN]; char preboot_ver[VERSION_MAX_LEN]; @@ -336,6 +335,7 @@ struct asic_fixed_properties { u32 cb_pool_cb_cnt; u32 cb_pool_cb_size; u32 max_pending_cs; + u32 max_queues; u16 sync_stream_first_sob; u16 sync_stream_first_mon; u8 tpc_enabled_mask; @@ -901,7 +901,7 @@ struct hl_userptr { * @aborted: true if CS was aborted due to some device error. */ struct hl_cs { - u16 jobs_in_queue_cnt[HL_MAX_QUEUES]; + u16 *jobs_in_queue_cnt; struct hl_ctx *ctx; struct list_head job_list; spinlock_t job_lock; diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c index da66ffb528f8..7965551587fc 100644 --- a/drivers/misc/habanalabs/hw_queue.c +++ b/drivers/misc/habanalabs/hw_queue.c @@ -46,7 +46,7 @@ void hl_int_hw_queue_update_ci(struct hl_cs *cs) goto out; q = &hdev->kernel_queues[0]; - for (i = 0 ; i < HL_MAX_QUEUES ; i++, q++) { + for (i = 0 ; i < hdev->asic_prop.max_queues ; i++, q++) { if (q->queue_type == QUEUE_TYPE_INT) { q->ci += cs->jobs_in_queue_cnt[i]; q->ci &= ((q->int_queue_len << 1) - 1); @@ -509,6 +509,7 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) struct hl_device *hdev = ctx->hdev; struct hl_cs_job *job, *tmp; struct hl_hw_queue *q; + u32 max_queues; int rc = 0, i, cq_cnt; hdev->asic_funcs->hw_queues_lock(hdev); @@ -521,8 +522,10 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) goto out; } + max_queues = hdev->asic_prop.max_queues; + q = &hdev->kernel_queues[0]; - for (i = 0, cq_cnt = 0 ; i < HL_MAX_QUEUES ; i++, q++) { + for (i = 0, cq_cnt = 0 ; i < max_queues ; i++, q++) { if (cs->jobs_in_queue_cnt[i]) { switch (q->queue_type) { case QUEUE_TYPE_EXT: @@ -601,7 +604,7 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) unroll_cq_resv: q = &hdev->kernel_queues[0]; - for (i = 0 ; (i < HL_MAX_QUEUES) && (cq_cnt > 0) ; i++, q++) { + for (i = 0 ; (i < max_queues) && (cq_cnt > 0) ; i++, q++) { if ((q->queue_type == QUEUE_TYPE_EXT || q->queue_type == QUEUE_TYPE_HW) && cs->jobs_in_queue_cnt[i]) { @@ -872,7 +875,7 @@ int hl_hw_queues_create(struct hl_device *hdev) struct hl_hw_queue *q; int i, rc, q_ready_cnt; - hdev->kernel_queues = kcalloc(HL_MAX_QUEUES, + hdev->kernel_queues = kcalloc(asic->max_queues, sizeof(*hdev->kernel_queues), GFP_KERNEL); if (!hdev->kernel_queues) { @@ -882,7 +885,7 @@ int hl_hw_queues_create(struct hl_device *hdev) /* Initialize the H/W queues */ for (i = 0, q_ready_cnt = 0, q = hdev->kernel_queues; - i < HL_MAX_QUEUES ; i++, q_ready_cnt++, q++) { + i < asic->max_queues ; i++, q_ready_cnt++, q++) { q->queue_type = asic->hw_queues_props[i].type; q->supports_sync_stream = @@ -909,9 +912,10 @@ release_queues: void hl_hw_queues_destroy(struct hl_device *hdev) { struct hl_hw_queue *q; + u32 max_queues = hdev->asic_prop.max_queues; int i; - for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++) + for (i = 0, q = hdev->kernel_queues ; i < max_queues ; i++, q++) queue_fini(hdev, q); kfree(hdev->kernel_queues); @@ -920,9 +924,10 @@ void hl_hw_queues_destroy(struct hl_device *hdev) void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset) { struct hl_hw_queue *q; + u32 max_queues = hdev->asic_prop.max_queues; int i; - for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++) { + for (i = 0, q = hdev->kernel_queues ; i < max_queues ; i++, q++) { if ((!q->valid) || ((!hard_reset) && (q->queue_type == QUEUE_TYPE_CPU))) continue; From 79b1894c419468607ce547855e8636d42b456149 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Wed, 24 Jun 2020 14:49:43 +0300 Subject: [PATCH 17/28] habanalabs: use queue pi/ci in order to determine queue occupancy Instead of using the free slots amount on the compute CQ to determine whether we can submit work to queues, use the queues pi/ci. This is needed in future ASICs where we don't have CQ per queue. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/device.c | 17 +++--- drivers/misc/habanalabs/habanalabs.h | 2 +- drivers/misc/habanalabs/hw_queue.c | 82 +++++++++------------------- drivers/misc/habanalabs/irq.c | 7 +-- 4 files changed, 39 insertions(+), 69 deletions(-) diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c index 2b38a119704c..65a5a5c52a48 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/device.c @@ -1144,14 +1144,17 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) * because there the addresses of the completion queues are being * passed as arguments to request_irq */ - hdev->completion_queue = kcalloc(cq_cnt, - sizeof(*hdev->completion_queue), - GFP_KERNEL); + if (cq_cnt) { + hdev->completion_queue = kcalloc(cq_cnt, + sizeof(*hdev->completion_queue), + GFP_KERNEL); - if (!hdev->completion_queue) { - dev_err(hdev->dev, "failed to allocate completion queues\n"); - rc = -ENOMEM; - goto hw_queues_destroy; + if (!hdev->completion_queue) { + dev_err(hdev->dev, + "failed to allocate completion queues\n"); + rc = -ENOMEM; + goto hw_queues_destroy; + } } for (i = 0, cq_ready_cnt = 0 ; i < cq_cnt ; i++, cq_ready_cnt++) { diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 9213d107b533..a61aab09778c 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -461,7 +461,7 @@ struct hl_hw_queue { u64 kernel_address; dma_addr_t bus_address; u32 pi; - u32 ci; + atomic_t ci; u32 hw_queue_id; u32 cq_id; u32 msi_vec; diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c index 7965551587fc..474a0e8a7797 100644 --- a/drivers/misc/habanalabs/hw_queue.c +++ b/drivers/misc/habanalabs/hw_queue.c @@ -23,10 +23,14 @@ inline u32 hl_hw_queue_add_ptr(u32 ptr, u16 val) ptr &= ((HL_QUEUE_LENGTH << 1) - 1); return ptr; } +static inline int queue_ci_get(atomic_t *ci, u32 queue_len) +{ + return atomic_read(ci) & ((queue_len << 1) - 1); +} static inline int queue_free_slots(struct hl_hw_queue *q, u32 queue_len) { - int delta = (q->pi - q->ci); + int delta = (q->pi - queue_ci_get(&q->ci, queue_len)); if (delta >= 0) return (queue_len - delta); @@ -40,21 +44,14 @@ void hl_int_hw_queue_update_ci(struct hl_cs *cs) struct hl_hw_queue *q; int i; - hdev->asic_funcs->hw_queues_lock(hdev); - if (hdev->disabled) - goto out; + return; q = &hdev->kernel_queues[0]; for (i = 0 ; i < hdev->asic_prop.max_queues ; i++, q++) { - if (q->queue_type == QUEUE_TYPE_INT) { - q->ci += cs->jobs_in_queue_cnt[i]; - q->ci &= ((q->int_queue_len << 1) - 1); - } + if (q->queue_type == QUEUE_TYPE_INT) + atomic_add(cs->jobs_in_queue_cnt[i], &q->ci); } - -out: - hdev->asic_funcs->hw_queues_unlock(hdev); } /* @@ -174,38 +171,26 @@ static int int_queue_sanity_checks(struct hl_device *hdev, } /* - * hw_queue_sanity_checks() - Perform some sanity checks on a H/W queue. + * hw_queue_sanity_checks() - Make sure we have enough space in the h/w queue * @hdev: Pointer to hl_device structure. * @q: Pointer to hl_hw_queue structure. * @num_of_entries: How many entries to check for space. * - * Perform the following: - * - Make sure we have enough space in the completion queue. - * This check also ensures that there is enough space in the h/w queue, as - * both queues are of the same size. - * - Reserve space in the completion queue (needs to be reversed if there - * is a failure down the road before the actual submission of work). + * Notice: We do not reserve queue entries so this function mustn't be called + * more than once per CS for the same queue * - * Both operations are done using the "free_slots_cnt" field of the completion - * queue. The CI counters of the queue and the completion queue are not - * needed/used for the H/W queue type. */ static int hw_queue_sanity_checks(struct hl_device *hdev, struct hl_hw_queue *q, int num_of_entries) { - atomic_t *free_slots = - &hdev->completion_queue[q->cq_id].free_slots_cnt; + int free_slots_cnt; - /* - * Check we have enough space in the completion queue. - * Add -1 to counter (decrement) unless counter was already 0. - * In that case, CQ is full so we can't submit a new CB. - * atomic_add_unless will return 0 if counter was already 0. - */ - if (atomic_add_negative(num_of_entries * -1, free_slots)) { - dev_dbg(hdev->dev, "No space for %d entries on CQ %d\n", - num_of_entries, q->hw_queue_id); - atomic_add(num_of_entries, free_slots); + /* Check we have enough space in the queue */ + free_slots_cnt = queue_free_slots(q, HL_QUEUE_LENGTH); + + if (free_slots_cnt < num_of_entries) { + dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n", + q->hw_queue_id, num_of_entries); return -EAGAIN; } @@ -366,7 +351,6 @@ static void hw_queue_schedule_job(struct hl_cs_job *job) { struct hl_device *hdev = job->cs->ctx->hdev; struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id]; - struct hl_cq *cq; u64 ptr; u32 offset, ctl, len; @@ -395,17 +379,6 @@ static void hw_queue_schedule_job(struct hl_cs_job *job) else ptr = (u64) (uintptr_t) job->user_cb; - /* - * No need to protect pi_offset because scheduling to the - * H/W queues is done under the scheduler mutex - * - * No need to check if CQ is full because it was already - * checked in hw_queue_sanity_checks - */ - cq = &hdev->completion_queue[q->cq_id]; - - cq->pi = hl_cq_inc_ptr(cq->pi); - ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr); } @@ -552,8 +525,7 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) goto unroll_cq_resv; } - if (q->queue_type == QUEUE_TYPE_EXT || - q->queue_type == QUEUE_TYPE_HW) + if (q->queue_type == QUEUE_TYPE_EXT) cq_cnt++; } } @@ -605,9 +577,8 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) unroll_cq_resv: q = &hdev->kernel_queues[0]; for (i = 0 ; (i < max_queues) && (cq_cnt > 0) ; i++, q++) { - if ((q->queue_type == QUEUE_TYPE_EXT || - q->queue_type == QUEUE_TYPE_HW) && - cs->jobs_in_queue_cnt[i]) { + if ((q->queue_type == QUEUE_TYPE_EXT) && + (cs->jobs_in_queue_cnt[i])) { atomic_t *free_slots = &hdev->completion_queue[i].free_slots_cnt; atomic_add(cs->jobs_in_queue_cnt[i], free_slots); @@ -631,7 +602,7 @@ void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id) { struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id]; - q->ci = hl_queue_inc_ptr(q->ci); + atomic_inc(&q->ci); } static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q, @@ -666,7 +637,7 @@ static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q, } /* Make sure read/write pointers are initialized to start of queue */ - q->ci = 0; + atomic_set(&q->ci, 0); q->pi = 0; return 0; @@ -700,7 +671,7 @@ static int int_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) q->kernel_address = (u64) (uintptr_t) p; q->pi = 0; - q->ci = 0; + atomic_set(&q->ci, 0); return 0; } @@ -729,7 +700,7 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) q->kernel_address = (u64) (uintptr_t) p; /* Make sure read/write pointers are initialized to start of queue */ - q->ci = 0; + atomic_set(&q->ci, 0); q->pi = 0; return 0; @@ -931,7 +902,8 @@ void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset) if ((!q->valid) || ((!hard_reset) && (q->queue_type == QUEUE_TYPE_CPU))) continue; - q->pi = q->ci = 0; + q->pi = 0; + atomic_set(&q->ci, 0); if (q->supports_sync_stream) sync_stream_queue_reset(hdev, q->hw_queue_id); diff --git a/drivers/misc/habanalabs/irq.c b/drivers/misc/habanalabs/irq.c index 7a4878edb1a3..195a5ecba0e8 100644 --- a/drivers/misc/habanalabs/irq.c +++ b/drivers/misc/habanalabs/irq.c @@ -122,12 +122,7 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg) queue_work(hdev->cq_wq, &job->finish_work); } - /* Update ci of the context's queue. There is no - * need to protect it with spinlock because this update is - * done only inside IRQ and there is a different IRQ per - * queue - */ - queue->ci = hl_queue_inc_ptr(queue->ci); + atomic_inc(&queue->ci); /* Clear CQ entry ready bit */ cq_entry->data = cpu_to_le32(le32_to_cpu(cq_entry->data) & From 9158c47e2059967038b19d051a1afd87954fbba4 Mon Sep 17 00:00:00 2001 From: Omer Shpigelman Date: Wed, 8 Jul 2020 00:29:54 +0300 Subject: [PATCH 18/28] habanalabs: remove unused hash Remove an old hash that is not in use anymore. Signed-off-by: Omer Shpigelman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/habanalabs.h | 2 -- drivers/misc/habanalabs/mmu.c | 1 - 2 files changed, 3 deletions(-) diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index a61aab09778c..ea0fd178accb 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -780,7 +780,6 @@ struct hl_va_range { * struct hl_ctx - user/kernel context. * @mem_hash: holds mapping from virtual address to virtual memory area * descriptor (hl_vm_phys_pg_list or hl_userptr). - * @mmu_phys_hash: holds a mapping from physical address to pgt_info structure. * @mmu_shadow_hash: holds a mapping from shadow address to pgt_info structure. * @hpriv: pointer to the private (Kernel Driver) data of the process (fd). * @hdev: pointer to the device structure. @@ -814,7 +813,6 @@ struct hl_va_range { */ struct hl_ctx { DECLARE_HASHTABLE(mem_hash, MEM_HASH_TABLE_BITS); - DECLARE_HASHTABLE(mmu_phys_hash, MMU_HASH_TABLE_BITS); DECLARE_HASHTABLE(mmu_shadow_hash, MMU_HASH_TABLE_BITS); struct hl_fpriv *hpriv; struct hl_device *hdev; diff --git a/drivers/misc/habanalabs/mmu.c b/drivers/misc/habanalabs/mmu.c index a290d6b49d78..04303950e630 100644 --- a/drivers/misc/habanalabs/mmu.c +++ b/drivers/misc/habanalabs/mmu.c @@ -502,7 +502,6 @@ int hl_mmu_ctx_init(struct hl_ctx *ctx) return 0; mutex_init(&ctx->mmu_lock); - hash_init(ctx->mmu_phys_hash); hash_init(ctx->mmu_shadow_hash); return dram_default_mapping_init(ctx); From c83c4171933bc4ebd147efb6bbdb787b25d1907d Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sun, 5 Jul 2020 15:48:34 +0300 Subject: [PATCH 19/28] habanalabs: halt device CPU only upon certain reset Currently the driver halts the device CPU in the halt engines function, which halts all the engines of the ASIC. The problem is that if later on we stop the reset process (due to inability to clean memory mappings in time), the CPU will remain in halt mode. This creates many issues, such as thermal/power control and FLR handling. Therefore, move the halting of the device CPU to the very end of the reset process, just before writing to the registers to initiate the reset. In addition, the driver now needs to send a message to the device F/W to disable it from sending interrupts to the host machine because during halt engines function the driver disables the MSI/MSI-X interrupts. Signed-off-by: Oded Gabbay Reviewed-by: Tomer Tayar --- drivers/misc/habanalabs/device.c | 16 ++++++++ drivers/misc/habanalabs/gaudi/gaudi.c | 40 +++++++++++-------- drivers/misc/habanalabs/goya/goya.c | 38 +++++++++--------- .../include/gaudi/asic_reg/gaudi_regs.h | 1 + .../habanalabs/include/gaudi/gaudi_masks.h | 3 ++ 5 files changed, 61 insertions(+), 37 deletions(-) diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c index 65a5a5c52a48..df709767c7ea 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/device.c @@ -838,6 +838,22 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset, if (rc) return 0; + if (hard_reset) { + /* Disable PCI access from device F/W so he won't send + * us additional interrupts. We disable MSI/MSI-X at + * the halt_engines function and we can't have the F/W + * sending us interrupts after that. We need to disable + * the access here because if the device is marked + * disable, the message won't be send. Also, in case + * of heartbeat, the device CPU is marked as disable + * so this message won't be sent + */ + if (hl_fw_send_pci_access_msg(hdev, + ARMCP_PACKET_DISABLE_PCI_ACCESS)) + dev_warn(hdev->dev, + "Failed to disable PCI access by F/W\n"); + } + /* This also blocks future CS/VM/JOB completion operations */ hdev->disabled = true; diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 7eee4a10154b..a9fd3d352ef0 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -2578,27 +2578,16 @@ static void gaudi_disable_timestamp(struct hl_device *hdev) static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset) { - u32 wait_timeout_ms, cpu_timeout_ms; + u32 wait_timeout_ms; dev_info(hdev->dev, "Halting compute engines and disabling interrupts\n"); - if (hdev->pldm) { + if (hdev->pldm) wait_timeout_ms = GAUDI_PLDM_RESET_WAIT_MSEC; - cpu_timeout_ms = GAUDI_PLDM_RESET_WAIT_MSEC; - } else { + else wait_timeout_ms = GAUDI_RESET_WAIT_MSEC; - cpu_timeout_ms = GAUDI_CPU_RESET_WAIT_MSEC; - } - /* - * I don't know what is the state of the CPU so make sure it is - * stopped in any means necessary - */ - WREG32(mmPSOC_GLOBAL_CONF_KMD_MSG_TO_CPU, KMD_MSG_GOTO_WFE); - WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, - GAUDI_EVENT_HALT_MACHINE); - msleep(cpu_timeout_ms); gaudi_stop_mme_qmans(hdev); gaudi_stop_tpc_qmans(hdev); @@ -2966,17 +2955,34 @@ disable_queues: static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset) { struct gaudi_device *gaudi = hdev->asic_specific; - u32 status, reset_timeout_ms, boot_strap = 0; + u32 status, reset_timeout_ms, cpu_timeout_ms, boot_strap = 0; if (!hard_reset) { dev_err(hdev->dev, "GAUDI doesn't support soft-reset\n"); return; } - if (hdev->pldm) + if (hdev->pldm) { reset_timeout_ms = GAUDI_PLDM_HRESET_TIMEOUT_MSEC; - else + cpu_timeout_ms = GAUDI_PLDM_RESET_WAIT_MSEC; + } else { reset_timeout_ms = GAUDI_RESET_TIMEOUT_MSEC; + cpu_timeout_ms = GAUDI_CPU_RESET_WAIT_MSEC; + } + + /* Set device to handle FLR by H/W as we will put the device CPU to + * halt mode + */ + WREG32(mmPCIE_AUX_FLR_CTRL, (PCIE_AUX_FLR_CTRL_HW_CTRL_MASK | + PCIE_AUX_FLR_CTRL_INT_MASK_MASK)); + + /* I don't know what is the state of the CPU so make sure it is + * stopped in any means necessary + */ + WREG32(mmPSOC_GLOBAL_CONF_KMD_MSG_TO_CPU, KMD_MSG_GOTO_WFE); + WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, GAUDI_EVENT_HALT_MACHINE); + + msleep(cpu_timeout_ms); /* Tell ASIC not to re-initialize PCIe */ WREG32(mmPREBOOT_PCIE_EN, LKD_HARD_RESET_MAGIC); diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 36db771f391c..2b0937d950c1 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -2240,29 +2240,15 @@ static void goya_disable_timestamp(struct hl_device *hdev) static void goya_halt_engines(struct hl_device *hdev, bool hard_reset) { - u32 wait_timeout_ms, cpu_timeout_ms; + u32 wait_timeout_ms; dev_info(hdev->dev, "Halting compute engines and disabling interrupts\n"); - if (hdev->pldm) { + if (hdev->pldm) wait_timeout_ms = GOYA_PLDM_RESET_WAIT_MSEC; - cpu_timeout_ms = GOYA_PLDM_RESET_WAIT_MSEC; - } else { + else wait_timeout_ms = GOYA_RESET_WAIT_MSEC; - cpu_timeout_ms = GOYA_CPU_RESET_WAIT_MSEC; - } - - if (hard_reset) { - /* - * I don't know what is the state of the CPU so make sure it is - * stopped in any means necessary - */ - WREG32(mmPSOC_GLOBAL_CONF_UBOOT_MAGIC, KMD_MSG_GOTO_WFE); - WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, - GOYA_ASYNC_EVENT_ID_HALT_MACHINE); - msleep(cpu_timeout_ms); - } goya_stop_external_queues(hdev); goya_stop_internal_queues(hdev); @@ -2567,14 +2553,26 @@ disable_queues: static void goya_hw_fini(struct hl_device *hdev, bool hard_reset) { struct goya_device *goya = hdev->asic_specific; - u32 reset_timeout_ms, status; + u32 reset_timeout_ms, cpu_timeout_ms, status; - if (hdev->pldm) + if (hdev->pldm) { reset_timeout_ms = GOYA_PLDM_RESET_TIMEOUT_MSEC; - else + cpu_timeout_ms = GOYA_PLDM_RESET_WAIT_MSEC; + } else { reset_timeout_ms = GOYA_RESET_TIMEOUT_MSEC; + cpu_timeout_ms = GOYA_CPU_RESET_WAIT_MSEC; + } if (hard_reset) { + /* I don't know what is the state of the CPU so make sure it is + * stopped in any means necessary + */ + WREG32(mmPSOC_GLOBAL_CONF_UBOOT_MAGIC, KMD_MSG_GOTO_WFE); + WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, + GOYA_ASYNC_EVENT_ID_HALT_MACHINE); + + msleep(cpu_timeout_ms); + goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE); goya_disable_clk_rlx(hdev); goya_set_pll_refclk(hdev); diff --git a/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h b/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h index 0c75d43532bd..f92dc53af074 100644 --- a/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h +++ b/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h @@ -292,6 +292,7 @@ #define mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG 0xC02000 +#define mmPCIE_AUX_FLR_CTRL 0xC07394 #define mmPCIE_AUX_DBI 0xC07490 #endif /* ASIC_REG_GAUDI_REGS_H_ */ diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h b/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h index 96f08050ef0f..13ef6b2887fd 100644 --- a/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h +++ b/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h @@ -455,4 +455,7 @@ enum axi_id { QM_ARB_ERR_MSG_EN_CHOISE_WDT_MASK |\ QM_ARB_ERR_MSG_EN_AXI_LBW_ERR_MASK) +#define PCIE_AUX_FLR_CTRL_HW_CTRL_MASK 0x1 +#define PCIE_AUX_FLR_CTRL_INT_MASK_MASK 0x2 + #endif /* GAUDI_MASKS_H_ */ From 5574cb2194b13de78df68cd32655ddbe619b1251 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Sun, 5 Jul 2020 13:35:51 +0300 Subject: [PATCH 20/28] habanalabs: Assign each CQ with its own work queue We identified a possible race during job completion when working with a single multi-threaded work queue. In order to overcome this race we suggest using a single threaded work queue per completion queue, hence we guarantee jobs completion in order. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/command_submission.c | 4 +- drivers/misc/habanalabs/device.c | 39 ++++++++++++++++---- drivers/misc/habanalabs/habanalabs.h | 7 +++- drivers/misc/habanalabs/irq.c | 2 +- 4 files changed, 40 insertions(+), 12 deletions(-) diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c index 7769a1aacca1..c605be89f764 100644 --- a/drivers/misc/habanalabs/command_submission.c +++ b/drivers/misc/habanalabs/command_submission.c @@ -487,10 +487,12 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs) void hl_cs_rollback_all(struct hl_device *hdev) { + int i; struct hl_cs *cs, *tmp; /* flush all completions */ - flush_workqueue(hdev->cq_wq); + for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) + flush_workqueue(hdev->cq_wq[i]); /* Make sure we don't have leftovers in the H/W queues mirror list */ list_for_each_entry_safe(cs, tmp, &hdev->hw_queues_mirror_list, diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c index df709767c7ea..84800efec10d 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/device.c @@ -249,7 +249,8 @@ static void device_cdev_sysfs_del(struct hl_device *hdev) */ static int device_early_init(struct hl_device *hdev) { - int rc; + int i, rc; + char workq_name[32]; switch (hdev->asic_type) { case ASIC_GOYA: @@ -274,11 +275,24 @@ static int device_early_init(struct hl_device *hdev) if (rc) goto early_fini; - hdev->cq_wq = alloc_workqueue("hl-free-jobs", WQ_UNBOUND, 0); - if (hdev->cq_wq == NULL) { - dev_err(hdev->dev, "Failed to allocate CQ workqueue\n"); - rc = -ENOMEM; - goto asid_fini; + if (hdev->asic_prop.completion_queues_count) { + hdev->cq_wq = kcalloc(hdev->asic_prop.completion_queues_count, + sizeof(*hdev->cq_wq), + GFP_ATOMIC); + if (!hdev->cq_wq) { + rc = -ENOMEM; + goto asid_fini; + } + } + + for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) { + snprintf(workq_name, 32, "hl-free-jobs-%u", i); + hdev->cq_wq[i] = create_singlethread_workqueue(workq_name); + if (hdev->cq_wq == NULL) { + dev_err(hdev->dev, "Failed to allocate CQ workqueue\n"); + rc = -ENOMEM; + goto free_cq_wq; + } } hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0); @@ -321,7 +335,10 @@ free_chip_info: free_eq_wq: destroy_workqueue(hdev->eq_wq); free_cq_wq: - destroy_workqueue(hdev->cq_wq); + for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) + if (hdev->cq_wq[i]) + destroy_workqueue(hdev->cq_wq[i]); + kfree(hdev->cq_wq); asid_fini: hl_asid_fini(hdev); early_fini: @@ -339,6 +356,8 @@ early_fini: */ static void device_early_fini(struct hl_device *hdev) { + int i; + mutex_destroy(&hdev->mmu_cache_lock); mutex_destroy(&hdev->debug_lock); mutex_destroy(&hdev->send_cpu_message_lock); @@ -351,7 +370,10 @@ static void device_early_fini(struct hl_device *hdev) kfree(hdev->hl_chip_info); destroy_workqueue(hdev->eq_wq); - destroy_workqueue(hdev->cq_wq); + + for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) + destroy_workqueue(hdev->cq_wq[i]); + kfree(hdev->cq_wq); hl_asid_fini(hdev); @@ -1181,6 +1203,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) "failed to initialize completion queue\n"); goto cq_fini; } + hdev->completion_queue[i].cq_idx = i; } /* diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index ea0fd178accb..01fb45887a5a 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -479,6 +479,7 @@ struct hl_hw_queue { * @hdev: pointer to the device structure * @kernel_address: holds the queue's kernel virtual address * @bus_address: holds the queue's DMA address + * @cq_idx: completion queue index in array * @hw_queue_id: the id of the matching H/W queue * @ci: ci inside the queue * @pi: pi inside the queue @@ -488,6 +489,7 @@ struct hl_cq { struct hl_device *hdev; u64 kernel_address; dma_addr_t bus_address; + u32 cq_idx; u32 hw_queue_id; u32 ci; u32 pi; @@ -1396,7 +1398,8 @@ struct hl_device_idle_busy_ts { * @asic_name: ASIC specific nmae. * @asic_type: ASIC specific type. * @completion_queue: array of hl_cq. - * @cq_wq: work queue of completion queues for executing work in process context + * @cq_wq: work queues of completion queues for executing work in process + * context. * @eq_wq: work queue of event queue for executing work in process context. * @kernel_ctx: Kernel driver context structure. * @kernel_queues: array of hl_hw_queue. @@ -1492,7 +1495,7 @@ struct hl_device { char asic_name[16]; enum hl_asic_type asic_type; struct hl_cq *completion_queue; - struct workqueue_struct *cq_wq; + struct workqueue_struct **cq_wq; struct workqueue_struct *eq_wq; struct hl_ctx *kernel_ctx; struct hl_hw_queue *kernel_queues; diff --git a/drivers/misc/habanalabs/irq.c b/drivers/misc/habanalabs/irq.c index 195a5ecba0e8..c8db717023f5 100644 --- a/drivers/misc/habanalabs/irq.c +++ b/drivers/misc/habanalabs/irq.c @@ -119,7 +119,7 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg) if ((shadow_index_valid) && (!hdev->disabled)) { job = queue->shadow_queue[hl_pi_2_offset(shadow_index)]; - queue_work(hdev->cq_wq, &job->finish_work); + queue_work(hdev->cq_wq[cq->cq_idx], &job->finish_work); } atomic_inc(&queue->ci); From 22cb855598ed9aa7812941eb29218dc6033b4b43 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Wed, 8 Jul 2020 10:16:27 +0300 Subject: [PATCH 21/28] habanalabs: verify queue can contain all cs jobs In order for the user to be aware of wrong inputs, we must return error in case the amount of jobs per cs exceeds the corresponding queue size. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/habanalabs.h | 4 ++++ drivers/misc/habanalabs/hw_queue.c | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 01fb45887a5a..14def0d26d2d 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -421,6 +421,10 @@ struct hl_cs_job; #define HL_QUEUE_LENGTH 4096 #define HL_QUEUE_SIZE_IN_BYTES (HL_QUEUE_LENGTH * HL_BD_SIZE) +#if (HL_MAX_JOBS_PER_CS > HL_QUEUE_LENGTH) +#error "HL_QUEUE_LENGTH must be greater than HL_MAX_JOBS_PER_CS" +#endif + /* HL_CQ_LENGTH is in units of struct hl_cq_entry */ #define HL_CQ_LENGTH HL_QUEUE_LENGTH #define HL_CQ_SIZE_IN_BYTES (HL_CQ_LENGTH * HL_CQ_ENTRY_SIZE) diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c index 474a0e8a7797..287681646071 100644 --- a/drivers/misc/habanalabs/hw_queue.c +++ b/drivers/misc/habanalabs/hw_queue.c @@ -158,6 +158,13 @@ static int int_queue_sanity_checks(struct hl_device *hdev, { int free_slots_cnt; + if (num_of_entries > q->int_queue_len) { + dev_err(hdev->dev, + "Cannot populate queue %u with %u jobs\n", + q->hw_queue_id, num_of_entries); + return -ENOMEM; + } + /* Check we have enough space in the queue */ free_slots_cnt = queue_free_slots(q, q->int_queue_len); From a9855a2d91531001f1a952a042f17cc42ef30cb7 Mon Sep 17 00:00:00 2001 From: Moti Haimovski Date: Wed, 24 Jun 2020 19:40:57 +0300 Subject: [PATCH 22/28] habanalabs: check for DMA errors when clearing memory In GAUDI we use QMAN0 DMA for clearing the MMU memory region at initialization. if this operation fails it places the DMA in an error state and then when trying to initialize QMAN0 we fail and erroneously assume its the QMAN that failed. This commit adds a check and clear of such DMA errors at initialization so we will have a better understanding of what went wrong. Signed-off-by: Moti Haimovski Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index a9fd3d352ef0..57b2b9392cb2 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -4253,7 +4253,7 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr, { struct packet_lin_dma *lin_dma_pkt; struct hl_cs_job *job; - u32 cb_size, ctl; + u32 cb_size, ctl, err_cause; struct hl_cb *cb; int rc; @@ -4282,6 +4282,15 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr, goto release_cb; } + /* Verify DMA is OK */ + err_cause = RREG32(mmDMA0_CORE_ERR_CAUSE); + if (err_cause && !hdev->init_done) { + dev_dbg(hdev->dev, + "Clearing DMA0 engine from errors (cause 0x%x)\n", + err_cause); + WREG32(mmDMA0_CORE_ERR_CAUSE, err_cause); + } + job->id = 0; job->user_cb = cb; job->user_cb->cs_cnt++; @@ -4293,11 +4302,23 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr, hl_debugfs_add_job(hdev, job); rc = gaudi_send_job_on_qman0(hdev, job); - hl_debugfs_remove_job(hdev, job); kfree(job); cb->cs_cnt--; + /* Verify DMA is OK */ + err_cause = RREG32(mmDMA0_CORE_ERR_CAUSE); + if (err_cause) { + dev_err(hdev->dev, "DMA Failed, cause 0x%x\n", err_cause); + rc = -EIO; + if (!hdev->init_done) { + dev_dbg(hdev->dev, + "Clearing DMA0 engine from errors (cause 0x%x)\n", + err_cause); + WREG32(mmDMA0_CORE_ERR_CAUSE, err_cause); + } + } + release_cb: hl_cb_put(cb); hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT); From 70b2f993ea4a79c298aac4ec1c58089020536ba5 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 13 Jul 2020 12:21:04 +0300 Subject: [PATCH 23/28] habanalabs: create common folder For internal needs of our CI we need to move all the common code into a common folder instead of putting them in the root folder of the driver. Same applies to the common header files under include/ Signed-off-by: Oded Gabbay Reviewed-by: Omer Shpigelman --- drivers/misc/habanalabs/Makefile | 11 +++++------ drivers/misc/habanalabs/common/Makefile | 9 +++++++++ drivers/misc/habanalabs/{ => common}/asid.c | 0 drivers/misc/habanalabs/{ => common}/command_buffer.c | 0 .../misc/habanalabs/{ => common}/command_submission.c | 0 drivers/misc/habanalabs/{ => common}/context.c | 0 drivers/misc/habanalabs/{ => common}/debugfs.c | 0 drivers/misc/habanalabs/{ => common}/device.c | 0 drivers/misc/habanalabs/{ => common}/firmware_if.c | 2 +- drivers/misc/habanalabs/{ => common}/habanalabs.h | 4 ++-- drivers/misc/habanalabs/{ => common}/habanalabs_drv.c | 0 .../misc/habanalabs/{ => common}/habanalabs_ioctl.c | 0 drivers/misc/habanalabs/{ => common}/hw_queue.c | 0 drivers/misc/habanalabs/{ => common}/hwmon.c | 0 drivers/misc/habanalabs/{ => common}/irq.c | 0 drivers/misc/habanalabs/{ => common}/memory.c | 0 drivers/misc/habanalabs/{ => common}/mmu.c | 0 drivers/misc/habanalabs/{ => common}/pci.c | 0 drivers/misc/habanalabs/{ => common}/sysfs.c | 0 drivers/misc/habanalabs/gaudi/Makefile | 2 +- drivers/misc/habanalabs/gaudi/gaudiP.h | 2 +- drivers/misc/habanalabs/goya/goyaP.h | 2 +- .../misc/habanalabs/include/{ => common}/armcp_if.h | 0 .../misc/habanalabs/include/{ => common}/hl_boot_if.h | 0 .../misc/habanalabs/include/{ => common}/qman_if.h | 0 25 files changed, 20 insertions(+), 12 deletions(-) create mode 100644 drivers/misc/habanalabs/common/Makefile rename drivers/misc/habanalabs/{ => common}/asid.c (100%) rename drivers/misc/habanalabs/{ => common}/command_buffer.c (100%) rename drivers/misc/habanalabs/{ => common}/command_submission.c (100%) rename drivers/misc/habanalabs/{ => common}/context.c (100%) rename drivers/misc/habanalabs/{ => common}/debugfs.c (100%) rename drivers/misc/habanalabs/{ => common}/device.c (100%) rename drivers/misc/habanalabs/{ => common}/firmware_if.c (99%) rename drivers/misc/habanalabs/{ => common}/habanalabs.h (99%) rename drivers/misc/habanalabs/{ => common}/habanalabs_drv.c (100%) rename drivers/misc/habanalabs/{ => common}/habanalabs_ioctl.c (100%) rename drivers/misc/habanalabs/{ => common}/hw_queue.c (100%) rename drivers/misc/habanalabs/{ => common}/hwmon.c (100%) rename drivers/misc/habanalabs/{ => common}/irq.c (100%) rename drivers/misc/habanalabs/{ => common}/memory.c (100%) rename drivers/misc/habanalabs/{ => common}/mmu.c (100%) rename drivers/misc/habanalabs/{ => common}/pci.c (100%) rename drivers/misc/habanalabs/{ => common}/sysfs.c (100%) rename drivers/misc/habanalabs/include/{ => common}/armcp_if.h (100%) rename drivers/misc/habanalabs/include/{ => common}/hl_boot_if.h (100%) rename drivers/misc/habanalabs/include/{ => common}/qman_if.h (100%) diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile index 421ebd903069..a786c0a7de9a 100644 --- a/drivers/misc/habanalabs/Makefile +++ b/drivers/misc/habanalabs/Makefile @@ -3,16 +3,15 @@ # Makefile for HabanaLabs AI accelerators driver # -obj-m := habanalabs.o +obj-$(CONFIG_HABANA_AI) := habanalabs.o -habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \ - command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o memory.o \ - command_submission.o mmu.o firmware_if.o pci.o - -habanalabs-$(CONFIG_DEBUG_FS) += debugfs.o +include $(src)/common/Makefile +habanalabs-y += $(HL_COMMON_FILES) include $(src)/goya/Makefile habanalabs-y += $(HL_GOYA_FILES) include $(src)/gaudi/Makefile habanalabs-y += $(HL_GAUDI_FILES) + +habanalabs-$(CONFIG_DEBUG_FS) += common/debugfs.o diff --git a/drivers/misc/habanalabs/common/Makefile b/drivers/misc/habanalabs/common/Makefile new file mode 100644 index 000000000000..97d03b5c8683 --- /dev/null +++ b/drivers/misc/habanalabs/common/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0-only +subdir-ccflags-y += -I$(src)/common + +HL_COMMON_FILES := common/habanalabs_drv.o common/device.o common/context.o \ + common/asid.o common/habanalabs_ioctl.o \ + common/command_buffer.o common/hw_queue.o common/irq.o \ + common/sysfs.o common/hwmon.o common/memory.o \ + common/command_submission.o common/mmu.o common/firmware_if.o \ + common/pci.o diff --git a/drivers/misc/habanalabs/asid.c b/drivers/misc/habanalabs/common/asid.c similarity index 100% rename from drivers/misc/habanalabs/asid.c rename to drivers/misc/habanalabs/common/asid.c diff --git a/drivers/misc/habanalabs/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c similarity index 100% rename from drivers/misc/habanalabs/command_buffer.c rename to drivers/misc/habanalabs/common/command_buffer.c diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c similarity index 100% rename from drivers/misc/habanalabs/command_submission.c rename to drivers/misc/habanalabs/common/command_submission.c diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/common/context.c similarity index 100% rename from drivers/misc/habanalabs/context.c rename to drivers/misc/habanalabs/common/context.c diff --git a/drivers/misc/habanalabs/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c similarity index 100% rename from drivers/misc/habanalabs/debugfs.c rename to drivers/misc/habanalabs/common/debugfs.c diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/common/device.c similarity index 100% rename from drivers/misc/habanalabs/device.c rename to drivers/misc/habanalabs/common/device.c diff --git a/drivers/misc/habanalabs/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c similarity index 99% rename from drivers/misc/habanalabs/firmware_if.c rename to drivers/misc/habanalabs/common/firmware_if.c index 3be1549cd137..b2b84510b932 100644 --- a/drivers/misc/habanalabs/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -6,7 +6,7 @@ */ #include "habanalabs.h" -#include "include/hl_boot_if.h" +#include "include/common/hl_boot_if.h" #include #include diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h similarity index 99% rename from drivers/misc/habanalabs/habanalabs.h rename to drivers/misc/habanalabs/common/habanalabs.h index 14def0d26d2d..82532f1f94cb 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -8,8 +8,8 @@ #ifndef HABANALABSP_H_ #define HABANALABSP_H_ -#include "include/armcp_if.h" -#include "include/qman_if.h" +#include "include/common/armcp_if.h" +#include "include/common/qman_if.h" #include #include diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c similarity index 100% rename from drivers/misc/habanalabs/habanalabs_drv.c rename to drivers/misc/habanalabs/common/habanalabs_drv.c diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c similarity index 100% rename from drivers/misc/habanalabs/habanalabs_ioctl.c rename to drivers/misc/habanalabs/common/habanalabs_ioctl.c diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c similarity index 100% rename from drivers/misc/habanalabs/hw_queue.c rename to drivers/misc/habanalabs/common/hw_queue.c diff --git a/drivers/misc/habanalabs/hwmon.c b/drivers/misc/habanalabs/common/hwmon.c similarity index 100% rename from drivers/misc/habanalabs/hwmon.c rename to drivers/misc/habanalabs/common/hwmon.c diff --git a/drivers/misc/habanalabs/irq.c b/drivers/misc/habanalabs/common/irq.c similarity index 100% rename from drivers/misc/habanalabs/irq.c rename to drivers/misc/habanalabs/common/irq.c diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/common/memory.c similarity index 100% rename from drivers/misc/habanalabs/memory.c rename to drivers/misc/habanalabs/common/memory.c diff --git a/drivers/misc/habanalabs/mmu.c b/drivers/misc/habanalabs/common/mmu.c similarity index 100% rename from drivers/misc/habanalabs/mmu.c rename to drivers/misc/habanalabs/common/mmu.c diff --git a/drivers/misc/habanalabs/pci.c b/drivers/misc/habanalabs/common/pci.c similarity index 100% rename from drivers/misc/habanalabs/pci.c rename to drivers/misc/habanalabs/common/pci.c diff --git a/drivers/misc/habanalabs/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c similarity index 100% rename from drivers/misc/habanalabs/sysfs.c rename to drivers/misc/habanalabs/common/sysfs.c diff --git a/drivers/misc/habanalabs/gaudi/Makefile b/drivers/misc/habanalabs/gaudi/Makefile index f802cdc980ca..75104ae74e2b 100644 --- a/drivers/misc/habanalabs/gaudi/Makefile +++ b/drivers/misc/habanalabs/gaudi/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -subdir-ccflags-y += -I$(src) +subdir-ccflags-y += -I$(src)/common HL_GAUDI_FILES := gaudi/gaudi.o gaudi/gaudi_hwmgr.o gaudi/gaudi_security.o \ gaudi/gaudi_coresight.o diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h index bdc5f96085a7..a94ab6a180f0 100644 --- a/drivers/misc/habanalabs/gaudi/gaudiP.h +++ b/drivers/misc/habanalabs/gaudi/gaudiP.h @@ -10,7 +10,7 @@ #include #include "habanalabs.h" -#include "include/hl_boot_if.h" +#include "include/common/hl_boot_if.h" #include "include/gaudi/gaudi_packets.h" #include "include/gaudi/gaudi.h" #include "include/gaudi/gaudi_async_events.h" diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h index 8265cc21b45a..9e674cf39fd9 100644 --- a/drivers/misc/habanalabs/goya/goyaP.h +++ b/drivers/misc/habanalabs/goya/goyaP.h @@ -10,7 +10,7 @@ #include #include "habanalabs.h" -#include "include/hl_boot_if.h" +#include "include/common/hl_boot_if.h" #include "include/goya/goya_packets.h" #include "include/goya/goya.h" #include "include/goya/goya_async_events.h" diff --git a/drivers/misc/habanalabs/include/armcp_if.h b/drivers/misc/habanalabs/include/common/armcp_if.h similarity index 100% rename from drivers/misc/habanalabs/include/armcp_if.h rename to drivers/misc/habanalabs/include/common/armcp_if.h diff --git a/drivers/misc/habanalabs/include/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h similarity index 100% rename from drivers/misc/habanalabs/include/hl_boot_if.h rename to drivers/misc/habanalabs/include/common/hl_boot_if.h diff --git a/drivers/misc/habanalabs/include/qman_if.h b/drivers/misc/habanalabs/include/common/qman_if.h similarity index 100% rename from drivers/misc/habanalabs/include/qman_if.h rename to drivers/misc/habanalabs/include/common/qman_if.h From eb8b293e794bbbafa9d615ea939982a19bf92867 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 13 Jul 2020 13:11:33 +0300 Subject: [PATCH 24/28] habanalabs: update hl_boot_if.h from firmware Update the boot interface file from the latest version from firmware. Defines for secure boot were added. Signed-off-by: Oded Gabbay Reviewed-by: Omer Shpigelman --- .../misc/habanalabs/include/common/hl_boot_if.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/misc/habanalabs/include/common/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h index c22d134e73af..bb67cafc6e00 100644 --- a/drivers/misc/habanalabs/include/common/hl_boot_if.h +++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h @@ -44,6 +44,15 @@ * The NIC FW loading and initialization * failed. This means NICs are not usable. * + * CPU_BOOT_ERR0_SECURITY_NOT_RDY Chip security initialization has been + * started, but is not ready yet - chip + * cannot be accessed. + * + * CPU_BOOT_ERR0_SECURITY_FAIL Security related tasks have failed. + * The tasks are security init (root of + * trust), boot authentication (chain of + * trust), data packets authentication. + * * CPU_BOOT_ERR0_ENABLED Error registers enabled. * This is a main indication that the * running FW populates the error @@ -57,6 +66,8 @@ #define CPU_BOOT_ERR0_BMC_WAIT_SKIPPED (1 << 4) #define CPU_BOOT_ERR0_NIC_DATA_NOT_RDY (1 << 5) #define CPU_BOOT_ERR0_NIC_FW_FAIL (1 << 6) +#define CPU_BOOT_ERR0_SECURITY_NOT_RDY (1 << 7) +#define CPU_BOOT_ERR0_SECURITY_FAIL (1 << 8) #define CPU_BOOT_ERR0_ENABLED (1 << 31) enum cpu_boot_status { @@ -79,7 +90,10 @@ enum cpu_boot_status { CPU_BOOT_STATUS_BMC_WAITING_SKIPPED, /* deprecated - will be removed */ /* Last boot loader progress status, ready to receive commands */ CPU_BOOT_STATUS_READY_TO_BOOT = 15, + /* Internal Boot finished, ready for boot-fit */ CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT = 16, + /* Internal Security has been initialized, device can be accessed */ + CPU_BOOT_STATUS_SECURITY_READY = 17, }; enum kmd_msg { From a04b7cd97eef13a489ca44c979cf91e24cfa7b55 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Mon, 13 Jul 2020 13:36:55 +0300 Subject: [PATCH 25/28] habanalabs: create internal CB pool Create a device MMU-mapped internal command buffer pool, in order to allow the driver to allocate CBs for the signal/wait operations that are fetched by the queues when they are configured with the user's address space ID. We must pre-map this internal pool due to performance issues. This pool is needed for future ASIC support and it is currently unused in GOYA and GAUDI. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../misc/habanalabs/common/command_buffer.c | 78 ++++++++++++------- .../habanalabs/common/command_submission.c | 13 ++-- drivers/misc/habanalabs/common/context.c | 8 ++ drivers/misc/habanalabs/common/habanalabs.h | 18 ++++- drivers/misc/habanalabs/gaudi/gaudi.c | 20 +++-- drivers/misc/habanalabs/goya/goya.c | 18 +++-- 6 files changed, 104 insertions(+), 51 deletions(-) diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c index 02d13f71b1df..7c38c4f7f9c0 100644 --- a/drivers/misc/habanalabs/common/command_buffer.c +++ b/drivers/misc/habanalabs/common/command_buffer.c @@ -10,12 +10,18 @@ #include #include +#include static void cb_fini(struct hl_device *hdev, struct hl_cb *cb) { - hdev->asic_funcs->asic_dma_free_coherent(hdev, cb->size, - (void *) (uintptr_t) cb->kernel_address, - cb->bus_address); + if (cb->is_internal) + gen_pool_free(hdev->internal_cb_pool, + cb->kernel_address, cb->size); + else + hdev->asic_funcs->asic_dma_free_coherent(hdev, cb->size, + (void *) (uintptr_t) cb->kernel_address, + cb->bus_address); + kfree(cb); } @@ -44,9 +50,10 @@ static void cb_release(struct kref *ref) } static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size, - int ctx_id) + int ctx_id, bool internal_cb) { struct hl_cb *cb; + u32 cb_offset; void *p; /* @@ -65,13 +72,25 @@ static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size, if (!cb) return NULL; - if (ctx_id == HL_KERNEL_ASID_ID) + if (internal_cb) { + p = (void *) gen_pool_alloc(hdev->internal_cb_pool, cb_size); + if (!p) { + kfree(cb); + return NULL; + } + + cb_offset = p - hdev->internal_cb_pool_virt_addr; + cb->is_internal = true; + cb->bus_address = hdev->internal_cb_va_base + cb_offset; + } else if (ctx_id == HL_KERNEL_ASID_ID) { p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, cb_size, &cb->bus_address, GFP_ATOMIC); - else + } else { p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, cb_size, &cb->bus_address, GFP_USER | __GFP_ZERO); + } + if (!p) { dev_err(hdev->dev, "failed to allocate %d of dma memory for CB\n", @@ -87,7 +106,7 @@ static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size, } int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr, - u32 cb_size, u64 *handle, int ctx_id) + u32 cb_size, u64 *handle, int ctx_id, bool internal_cb) { struct hl_cb *cb; bool alloc_new_cb = true; @@ -112,28 +131,30 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr, goto out_err; } - /* Minimum allocation must be PAGE SIZE */ - if (cb_size < PAGE_SIZE) - cb_size = PAGE_SIZE; + if (!internal_cb) { + /* Minimum allocation must be PAGE SIZE */ + if (cb_size < PAGE_SIZE) + cb_size = PAGE_SIZE; - if (ctx_id == HL_KERNEL_ASID_ID && - cb_size <= hdev->asic_prop.cb_pool_cb_size) { + if (ctx_id == HL_KERNEL_ASID_ID && + cb_size <= hdev->asic_prop.cb_pool_cb_size) { - spin_lock(&hdev->cb_pool_lock); - if (!list_empty(&hdev->cb_pool)) { - cb = list_first_entry(&hdev->cb_pool, typeof(*cb), - pool_list); - list_del(&cb->pool_list); - spin_unlock(&hdev->cb_pool_lock); - alloc_new_cb = false; - } else { - spin_unlock(&hdev->cb_pool_lock); - dev_dbg(hdev->dev, "CB pool is empty\n"); + spin_lock(&hdev->cb_pool_lock); + if (!list_empty(&hdev->cb_pool)) { + cb = list_first_entry(&hdev->cb_pool, + typeof(*cb), pool_list); + list_del(&cb->pool_list); + spin_unlock(&hdev->cb_pool_lock); + alloc_new_cb = false; + } else { + spin_unlock(&hdev->cb_pool_lock); + dev_dbg(hdev->dev, "CB pool is empty\n"); + } } } if (alloc_new_cb) { - cb = hl_cb_alloc(hdev, cb_size, ctx_id); + cb = hl_cb_alloc(hdev, cb_size, ctx_id, internal_cb); if (!cb) { rc = -ENOMEM; goto out_err; @@ -229,8 +250,8 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data) rc = -EINVAL; } else { rc = hl_cb_create(hdev, &hpriv->cb_mgr, - args->in.cb_size, &handle, - hpriv->ctx->asid); + args->in.cb_size, &handle, + hpriv->ctx->asid, false); } memset(args, 0, sizeof(*args)); @@ -398,14 +419,15 @@ void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr) idr_destroy(&mgr->cb_handles); } -struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size) +struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size, + bool internal_cb) { u64 cb_handle; struct hl_cb *cb; int rc; rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, cb_size, &cb_handle, - HL_KERNEL_ASID_ID); + HL_KERNEL_ASID_ID, internal_cb); if (rc) { dev_err(hdev->dev, "Failed to allocate CB for the kernel driver %d\n", rc); @@ -437,7 +459,7 @@ int hl_cb_pool_init(struct hl_device *hdev) for (i = 0 ; i < hdev->asic_prop.cb_pool_cb_cnt ; i++) { cb = hl_cb_alloc(hdev, hdev->asic_prop.cb_pool_cb_size, - HL_KERNEL_ASID_ID); + HL_KERNEL_ASID_ID, false); if (cb) { cb->is_pool = true; list_add(&cb->pool_list, &hdev->cb_pool); diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index c605be89f764..e096532c0e48 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -919,7 +919,13 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, goto put_cs; } - cb = hl_cb_kernel_create(hdev, PAGE_SIZE); + if (cs->type == CS_TYPE_WAIT) + cb_size = hdev->asic_funcs->get_wait_cb_size(hdev); + else + cb_size = hdev->asic_funcs->get_signal_cb_size(hdev); + + cb = hl_cb_kernel_create(hdev, cb_size, + q_type == QUEUE_TYPE_HW && hdev->mmu_enable); if (!cb) { ctx->cs_counters.out_of_mem_drop_cnt++; kfree(job); @@ -927,11 +933,6 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, goto put_cs; } - if (cs->type == CS_TYPE_WAIT) - cb_size = hdev->asic_funcs->get_wait_cb_size(hdev); - else - cb_size = hdev->asic_funcs->get_signal_cb_size(hdev); - job->id = 0; job->cs = cs; job->user_cb = cb; diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c index 1e3e5b19ecd9..b75a20364fad 100644 --- a/drivers/misc/habanalabs/common/context.c +++ b/drivers/misc/habanalabs/common/context.c @@ -153,10 +153,18 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) rc = -ENOMEM; goto mem_ctx_err; } + + rc = hdev->asic_funcs->ctx_init(ctx); + if (rc) { + dev_err(hdev->dev, "ctx_init failed\n"); + goto ctx_init_err; + } } return 0; +ctx_init_err: + hl_vm_ctx_fini(ctx); mem_ctx_err: if (ctx->asid != HL_KERNEL_ASID_ID) hl_asid_free(hdev, ctx->asid); diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 82532f1f94cb..bf9abfa47b7a 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -392,6 +392,7 @@ struct hl_cb_mgr { * @ctx_id: holds the ID of the owner's context. * @mmap: true if the CB is currently mmaped to user. * @is_pool: true if CB was acquired from the pool, false otherwise. + * @is_internal: internaly allocated */ struct hl_cb { struct kref refcount; @@ -408,6 +409,7 @@ struct hl_cb { u32 ctx_id; u8 mmap; u8 is_pool; + u8 is_internal; }; @@ -643,6 +645,7 @@ enum div_select_defs { * @rreg: Read a register. Needed for simulator support. * @wreg: Write a register. Needed for simulator support. * @halt_coresight: stop the ETF and ETR traces. + * @ctx_init: context dependent initialization. * @get_clk_rate: Retrieve the ASIC current and maximum clock rate in MHz * @get_queue_id_for_cq: Get the H/W queue id related to the given CQ index. * @read_device_fw_version: read the device's firmware versions that are @@ -745,6 +748,7 @@ struct hl_asic_funcs { u32 (*rreg)(struct hl_device *hdev, u32 reg); void (*wreg)(struct hl_device *hdev, u32 reg, u32 val); void (*halt_coresight)(struct hl_device *hdev); + int (*ctx_init)(struct hl_ctx *ctx); int (*get_clk_rate)(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk); u32 (*get_queue_id_for_cq)(struct hl_device *hdev, u32 cq_idx); void (*read_device_fw_version)(struct hl_device *hdev, @@ -1432,6 +1436,10 @@ struct hl_device_idle_busy_ts { * @hl_debugfs: device's debugfs manager. * @cb_pool: list of preallocated CBs. * @cb_pool_lock: protects the CB pool. + * @internal_cb_pool_virt_addr: internal command buffer pool virtual address. + * @internal_cb_pool_dma_addr: internal command buffer pool dma address. + * @internal_cb_pool: internal command buffer memory pool. + * @internal_cb_va_base: internal cb pool mmu virtual address base * @fpriv_list: list of file private data structures. Each structure is created * when a user opens the device * @fpriv_list_lock: protects the fpriv_list @@ -1531,6 +1539,11 @@ struct hl_device { struct list_head cb_pool; spinlock_t cb_pool_lock; + void *internal_cb_pool_virt_addr; + dma_addr_t internal_cb_pool_dma_addr; + struct gen_pool *internal_cb_pool; + u64 internal_cb_va_base; + struct list_head fpriv_list; struct mutex fpriv_list_lock; @@ -1741,7 +1754,7 @@ int hl_hwmon_init(struct hl_device *hdev); void hl_hwmon_fini(struct hl_device *hdev); int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr, u32 cb_size, - u64 *handle, int ctx_id); + u64 *handle, int ctx_id, bool internal_cb); int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle); int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma); struct hl_cb *hl_cb_get(struct hl_device *hdev, struct hl_cb_mgr *mgr, @@ -1749,7 +1762,8 @@ struct hl_cb *hl_cb_get(struct hl_device *hdev, struct hl_cb_mgr *mgr, void hl_cb_put(struct hl_cb *cb); void hl_cb_mgr_init(struct hl_cb_mgr *mgr); void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr); -struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size); +struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size, + bool internal_cb); int hl_cb_pool_init(struct hl_device *hdev); int hl_cb_pool_fini(struct hl_device *hdev); diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 57b2b9392cb2..86cfaf73ad74 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -635,7 +635,7 @@ static int _gaudi_init_tpc_mem(struct hl_device *hdev, u8 tpc_id; int rc; - cb = hl_cb_kernel_create(hdev, PAGE_SIZE); + cb = hl_cb_kernel_create(hdev, PAGE_SIZE, false); if (!cb) return -EFAULT; @@ -4048,9 +4048,8 @@ static int gaudi_parse_cb_mmu(struct hl_device *hdev, parser->patched_cb_size = parser->user_cb_size + sizeof(struct packet_msg_prot) * 2; - rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, - parser->patched_cb_size, - &patched_cb_handle, HL_KERNEL_ASID_ID); + rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size, + &patched_cb_handle, HL_KERNEL_ASID_ID, false); if (rc) { dev_err(hdev->dev, @@ -4122,9 +4121,8 @@ static int gaudi_parse_cb_no_mmu(struct hl_device *hdev, if (rc) goto free_userptr; - rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, - parser->patched_cb_size, - &patched_cb_handle, HL_KERNEL_ASID_ID); + rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size, + &patched_cb_handle, HL_KERNEL_ASID_ID, false); if (rc) { dev_err(hdev->dev, "Failed to allocate patched CB for DMA CS %d\n", rc); @@ -4257,7 +4255,7 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr, struct hl_cb *cb; int rc; - cb = hl_cb_kernel_create(hdev, PAGE_SIZE); + cb = hl_cb_kernel_create(hdev, PAGE_SIZE, false); if (!cb) return -EFAULT; @@ -6229,6 +6227,11 @@ static enum hl_device_hw_state gaudi_get_hw_state(struct hl_device *hdev) return RREG32(mmHW_STATE); } +int gaudi_ctx_init(struct hl_ctx *ctx) +{ + return 0; +} + static u32 gaudi_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx) { return gaudi_cq_assignment[cq_idx]; @@ -6532,6 +6535,7 @@ static const struct hl_asic_funcs gaudi_funcs = { .rreg = hl_rreg, .wreg = hl_wreg, .halt_coresight = gaudi_halt_coresight, + .ctx_init = gaudi_ctx_init, .get_clk_rate = gaudi_get_clk_rate, .get_queue_id_for_cq = gaudi_get_queue_id_for_cq, .read_device_fw_version = gaudi_read_device_fw_version, diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 2b0937d950c1..4473ded313d6 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -3771,9 +3771,8 @@ static int goya_parse_cb_mmu(struct hl_device *hdev, parser->patched_cb_size = parser->user_cb_size + sizeof(struct packet_msg_prot) * 2; - rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, - parser->patched_cb_size, - &patched_cb_handle, HL_KERNEL_ASID_ID); + rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size, + &patched_cb_handle, HL_KERNEL_ASID_ID, false); if (rc) { dev_err(hdev->dev, @@ -3845,9 +3844,8 @@ static int goya_parse_cb_no_mmu(struct hl_device *hdev, if (rc) goto free_userptr; - rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, - parser->patched_cb_size, - &patched_cb_handle, HL_KERNEL_ASID_ID); + rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size, + &patched_cb_handle, HL_KERNEL_ASID_ID, false); if (rc) { dev_err(hdev->dev, "Failed to allocate patched CB for DMA CS %d\n", rc); @@ -4693,7 +4691,7 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size, lin_dma_pkts_cnt = DIV_ROUND_UP_ULL(size, SZ_2G); cb_size = lin_dma_pkts_cnt * sizeof(struct packet_lin_dma) + sizeof(struct packet_msg_prot); - cb = hl_cb_kernel_create(hdev, cb_size); + cb = hl_cb_kernel_create(hdev, cb_size, false); if (!cb) return -ENOMEM; @@ -5223,6 +5221,11 @@ static enum hl_device_hw_state goya_get_hw_state(struct hl_device *hdev) return RREG32(mmHW_STATE); } +int goya_ctx_init(struct hl_ctx *ctx) +{ + return 0; +} + u32 goya_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx) { return cq_idx; @@ -5336,6 +5339,7 @@ static const struct hl_asic_funcs goya_funcs = { .rreg = hl_rreg, .wreg = hl_wreg, .halt_coresight = goya_halt_coresight, + .ctx_init = goya_ctx_init, .get_clk_rate = goya_get_clk_rate, .get_queue_id_for_cq = goya_get_queue_id_for_cq, .read_device_fw_version = goya_read_device_fw_version, From 8df8cb1efc1962356c97656839ea9c41e1ed4ba9 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Wed, 15 Jul 2020 21:59:32 +0300 Subject: [PATCH 26/28] habanalabs: enable device before hw_init() Device is now enabled before the hw_init() because part of the initialization requires communication with the device firmware to get information that is required for the initialization itself Signed-off-by: Oded Gabbay Reviewed-by: Tomer Tayar --- drivers/misc/habanalabs/common/device.c | 16 ++++++++++++---- drivers/misc/habanalabs/common/sysfs.c | 3 +++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 84800efec10d..9919ff121067 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -1033,6 +1033,12 @@ again: } } + /* Device is now enabled as part of the initialization requires + * communication with the device firmware to get information that + * is required for the initialization itself + */ + hdev->disabled = false; + rc = hdev->asic_funcs->hw_init(hdev); if (rc) { dev_err(hdev->dev, @@ -1040,8 +1046,6 @@ again: goto out_err; } - hdev->disabled = false; - /* Check that the communication with the device is working */ rc = hdev->asic_funcs->test_queues(hdev); if (rc) { @@ -1261,6 +1265,12 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) */ add_cdev_sysfs_on_err = true; + /* Device is now enabled as part of the initialization requires + * communication with the device firmware to get information that + * is required for the initialization itself + */ + hdev->disabled = false; + rc = hdev->asic_funcs->hw_init(hdev); if (rc) { dev_err(hdev->dev, "failed to initialize the H/W\n"); @@ -1268,8 +1278,6 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) goto out_disabled; } - hdev->disabled = false; - /* Check that the communication with the device is working */ rc = hdev->asic_funcs->test_queues(hdev); if (rc) { diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c index 5d78d5e1c782..c4e7c682d584 100644 --- a/drivers/misc/habanalabs/common/sysfs.c +++ b/drivers/misc/habanalabs/common/sysfs.c @@ -334,6 +334,9 @@ static ssize_t eeprom_read_handler(struct file *filp, struct kobject *kobj, char *data; int rc; + if (hl_device_disabled_or_in_reset(hdev)) + return -ENODEV; + if (!max_size) return -EINVAL; From 644883ef1aa53af1efecf3f84df9951c448b876b Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Sun, 19 Jul 2020 11:00:03 +0300 Subject: [PATCH 27/28] habanalabs: use no flags on MMU cache invalidation gaudi_mmu_invalidate_cache() doesn't use the flags parameter, and thus it can be set to 0 when the function is called in the gaudi only files. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 86cfaf73ad74..4a1a52608fc0 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -2646,8 +2646,7 @@ static int gaudi_mmu_init(struct hl_device *hdev) WREG32(mmSTLB_CACHE_INV_BASE_39_8, MMU_CACHE_MNG_ADDR >> 8); WREG32(mmSTLB_CACHE_INV_BASE_49_40, MMU_CACHE_MNG_ADDR >> 40); - hdev->asic_funcs->mmu_invalidate_cache(hdev, true, - VM_TYPE_USERPTR | VM_TYPE_PHYS_PACK); + hdev->asic_funcs->mmu_invalidate_cache(hdev, true, 0); WREG32(mmMMU_UP_MMU_ENABLE, 1); WREG32(mmMMU_UP_SPI_MASK, 0xF); From 94f8be9eb065412cf069efd45053d33e8911fa9e Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Thu, 23 Jul 2020 09:17:57 +0300 Subject: [PATCH 28/28] habanalabs: Fix memory leak in error flow of context initialization Add a missing free of the cs_pending array in the error flow of context initialization. Fixes: c16d45f42b64 ("habanalabs: Use pending CS amount per ASIC") Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/context.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c index b75a20364fad..3e375958e73b 100644 --- a/drivers/misc/habanalabs/common/context.c +++ b/drivers/misc/habanalabs/common/context.c @@ -138,36 +138,38 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) rc = hl_mmu_ctx_init(ctx); if (rc) { dev_err(hdev->dev, "Failed to init mmu ctx module\n"); - goto mem_ctx_err; + goto err_free_cs_pending; } } else { ctx->asid = hl_asid_alloc(hdev); if (!ctx->asid) { dev_err(hdev->dev, "No free ASID, failed to create context\n"); - return -ENOMEM; + rc = -ENOMEM; + goto err_free_cs_pending; } rc = hl_vm_ctx_init(ctx); if (rc) { dev_err(hdev->dev, "Failed to init mem ctx module\n"); rc = -ENOMEM; - goto mem_ctx_err; + goto err_asid_free; } rc = hdev->asic_funcs->ctx_init(ctx); if (rc) { dev_err(hdev->dev, "ctx_init failed\n"); - goto ctx_init_err; + goto err_vm_ctx_fini; } } return 0; -ctx_init_err: +err_vm_ctx_fini: hl_vm_ctx_fini(ctx); -mem_ctx_err: - if (ctx->asid != HL_KERNEL_ASID_ID) - hl_asid_free(hdev, ctx->asid); +err_asid_free: + hl_asid_free(hdev, ctx->asid); +err_free_cs_pending: + kfree(ctx->cs_pending); return rc; }