From 508c5849c62d009e03f37ad0f556071fac5112f0 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Thu, 1 Aug 2019 13:57:36 +0000 Subject: [PATCH 1/6] habanalabs: Avoid double free in error flow In case kernel context init fails during device initialization, both hl_ctx_put() and kfree() are called, ending with a double free of the kernel context. Calling kfree() is needed only when a failure happens between the allocation of the kernel context and its initialization, so move it to there and remove it from the error flow. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/device.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c index 0c4894dd9c02..7a8f9d0b71b5 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/device.c @@ -970,7 +970,8 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) rc = hl_ctx_init(hdev, hdev->kernel_ctx, true); if (rc) { dev_err(hdev->dev, "failed to initialize kernel context\n"); - goto free_ctx; + kfree(hdev->kernel_ctx); + goto mmu_fini; } rc = hl_cb_pool_init(hdev); @@ -1053,8 +1054,6 @@ release_ctx: if (hl_ctx_put(hdev->kernel_ctx) != 1) dev_err(hdev->dev, "kernel ctx is still alive on initialization failure\n"); -free_ctx: - kfree(hdev->kernel_ctx); mmu_fini: hl_mmu_fini(hdev); eq_fini: From c8113756ba27298d6e95403c087dc5881b419a99 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Sun, 4 Aug 2019 07:03:41 +0000 Subject: [PATCH 2/6] habanalabs: fix DRAM usage accounting on context tear down The patch fix the DRAM usage accounting by adding a missing update of the DRAM memory consumption, when a context is being torn down without an organized release of the allocated memory. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/memory.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c index 42d237cae1dc..365fb0cb8dff 100644 --- a/drivers/misc/habanalabs/memory.c +++ b/drivers/misc/habanalabs/memory.c @@ -1629,6 +1629,8 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx) dev_dbg(hdev->dev, "page list 0x%p of asid %d is still alive\n", phys_pg_list, ctx->asid); + atomic64_sub(phys_pg_list->total_size, + &hdev->dram_used_mem); free_phys_pg_pack(hdev, phys_pg_list); idr_remove(&vm->phys_pg_pack_handles, i); } From 213ad5ad016a0da975b35f54e8cd236c3b04724b Mon Sep 17 00:00:00 2001 From: Ben Segal Date: Thu, 1 Aug 2019 23:20:32 +0000 Subject: [PATCH 3/6] habanalabs: fix endianness handling for packets from user Packets that arrive from the user and need to be parsed by the driver are assumed to be in LE format. This patch fix all the places where the code handles these packets and use the correct endianness macros to handle them, as the driver handles the packets in CPU format (LE or BE depending on the arch). Signed-off-by: Ben Segal Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/goya/goya.c | 32 +++++++++++-------- .../habanalabs/include/goya/goya_packets.h | 13 ++++++++ 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index a0e181714891..e8b1142910e0 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -3428,12 +3428,13 @@ static int goya_validate_cb(struct hl_device *hdev, while (cb_parsed_length < parser->user_cb_size) { enum packet_id pkt_id; u16 pkt_size; - void *user_pkt; + struct goya_packet *user_pkt; - user_pkt = (void *) (uintptr_t) + user_pkt = (struct goya_packet *) (uintptr_t) (parser->user_cb->kernel_address + cb_parsed_length); - pkt_id = (enum packet_id) (((*(u64 *) user_pkt) & + pkt_id = (enum packet_id) ( + (le64_to_cpu(user_pkt->header) & PACKET_HEADER_PACKET_ID_MASK) >> PACKET_HEADER_PACKET_ID_SHIFT); @@ -3453,7 +3454,8 @@ static int goya_validate_cb(struct hl_device *hdev, * need to validate here as well because patch_cb() is * not called in MMU path while this function is called */ - rc = goya_validate_wreg32(hdev, parser, user_pkt); + rc = goya_validate_wreg32(hdev, + parser, (struct packet_wreg32 *) user_pkt); break; case PACKET_WREG_BULK: @@ -3481,10 +3483,10 @@ static int goya_validate_cb(struct hl_device *hdev, case PACKET_LIN_DMA: if (is_mmu) rc = goya_validate_dma_pkt_mmu(hdev, parser, - user_pkt); + (struct packet_lin_dma *) user_pkt); else rc = goya_validate_dma_pkt_no_mmu(hdev, parser, - user_pkt); + (struct packet_lin_dma *) user_pkt); break; case PACKET_MSG_LONG: @@ -3657,15 +3659,16 @@ static int goya_patch_cb(struct hl_device *hdev, enum packet_id pkt_id; u16 pkt_size; u32 new_pkt_size = 0; - void *user_pkt, *kernel_pkt; + struct goya_packet *user_pkt, *kernel_pkt; - user_pkt = (void *) (uintptr_t) + user_pkt = (struct goya_packet *) (uintptr_t) (parser->user_cb->kernel_address + cb_parsed_length); - kernel_pkt = (void *) (uintptr_t) + kernel_pkt = (struct goya_packet *) (uintptr_t) (parser->patched_cb->kernel_address + cb_patched_cur_length); - pkt_id = (enum packet_id) (((*(u64 *) user_pkt) & + pkt_id = (enum packet_id) ( + (le64_to_cpu(user_pkt->header) & PACKET_HEADER_PACKET_ID_MASK) >> PACKET_HEADER_PACKET_ID_SHIFT); @@ -3680,15 +3683,18 @@ static int goya_patch_cb(struct hl_device *hdev, switch (pkt_id) { case PACKET_LIN_DMA: - rc = goya_patch_dma_packet(hdev, parser, user_pkt, - kernel_pkt, &new_pkt_size); + rc = goya_patch_dma_packet(hdev, parser, + (struct packet_lin_dma *) user_pkt, + (struct packet_lin_dma *) kernel_pkt, + &new_pkt_size); cb_patched_cur_length += new_pkt_size; break; case PACKET_WREG_32: memcpy(kernel_pkt, user_pkt, pkt_size); cb_patched_cur_length += pkt_size; - rc = goya_validate_wreg32(hdev, parser, kernel_pkt); + rc = goya_validate_wreg32(hdev, parser, + (struct packet_wreg32 *) kernel_pkt); break; case PACKET_WREG_BULK: diff --git a/drivers/misc/habanalabs/include/goya/goya_packets.h b/drivers/misc/habanalabs/include/goya/goya_packets.h index a14407b975e4..ef54bad20509 100644 --- a/drivers/misc/habanalabs/include/goya/goya_packets.h +++ b/drivers/misc/habanalabs/include/goya/goya_packets.h @@ -52,6 +52,19 @@ enum goya_dma_direction { #define GOYA_PKT_CTL_MB_SHIFT 31 #define GOYA_PKT_CTL_MB_MASK 0x80000000 +/* All packets have, at least, an 8-byte header, which contains + * the packet type. The kernel driver uses the packet header for packet + * validation and to perform any necessary required preparation before + * sending them off to the hardware. + */ +struct goya_packet { + __le64 header; + /* The rest of the packet data follows. Use the corresponding + * packet_XXX struct to deference the data, based on packet type + */ + u8 contents[0]; +}; + struct packet_nop { __le32 reserved; __le32 ctl; From 4e87334a0ef43663019dbaf3638ad10fd8c3320c Mon Sep 17 00:00:00 2001 From: Ben Segal Date: Thu, 1 Aug 2019 23:22:20 +0000 Subject: [PATCH 4/6] habanalabs: fix completion queue handling when host is BE This patch fix the CQ irq handler to work in hosts with BE architecture. It adds the correct endian-swapping macros around the relevant memory accesses. Signed-off-by: Ben Segal Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/irq.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/misc/habanalabs/irq.c b/drivers/misc/habanalabs/irq.c index ea9f72ff456c..199791b57caf 100644 --- a/drivers/misc/habanalabs/irq.c +++ b/drivers/misc/habanalabs/irq.c @@ -80,8 +80,7 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg) struct hl_cs_job *job; bool shadow_index_valid; u16 shadow_index; - u32 *cq_entry; - u32 *cq_base; + struct hl_cq_entry *cq_entry, *cq_base; if (hdev->disabled) { dev_dbg(hdev->dev, @@ -90,29 +89,29 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg) return IRQ_HANDLED; } - cq_base = (u32 *) (uintptr_t) cq->kernel_address; + cq_base = (struct hl_cq_entry *) (uintptr_t) cq->kernel_address; while (1) { - bool entry_ready = ((cq_base[cq->ci] & CQ_ENTRY_READY_MASK) + bool entry_ready = ((le32_to_cpu(cq_base[cq->ci].data) & + CQ_ENTRY_READY_MASK) >> CQ_ENTRY_READY_SHIFT); if (!entry_ready) break; - cq_entry = (u32 *) &cq_base[cq->ci]; + cq_entry = (struct hl_cq_entry *) &cq_base[cq->ci]; - /* - * Make sure we read CQ entry contents after we've + /* Make sure we read CQ entry contents after we've * checked the ownership bit. */ dma_rmb(); - shadow_index_valid = - ((*cq_entry & CQ_ENTRY_SHADOW_INDEX_VALID_MASK) + shadow_index_valid = ((le32_to_cpu(cq_entry->data) & + CQ_ENTRY_SHADOW_INDEX_VALID_MASK) >> CQ_ENTRY_SHADOW_INDEX_VALID_SHIFT); - shadow_index = (u16) - ((*cq_entry & CQ_ENTRY_SHADOW_INDEX_MASK) + shadow_index = (u16) ((le32_to_cpu(cq_entry->data) & + CQ_ENTRY_SHADOW_INDEX_MASK) >> CQ_ENTRY_SHADOW_INDEX_SHIFT); queue = &hdev->kernel_queues[cq->hw_queue_id]; @@ -122,8 +121,7 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg) queue_work(hdev->cq_wq, &job->finish_work); } - /* - * Update ci of the context's queue. There is no + /* Update ci of the context's queue. There is no * need to protect it with spinlock because this update is * done only inside IRQ and there is a different IRQ per * queue @@ -131,7 +129,8 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg) queue->ci = hl_queue_inc_ptr(queue->ci); /* Clear CQ entry ready bit */ - cq_base[cq->ci] &= ~CQ_ENTRY_READY_MASK; + cq_entry->data = cpu_to_le32(le32_to_cpu(cq_entry->data) & + ~CQ_ENTRY_READY_MASK); cq->ci = hl_cq_inc_ptr(cq->ci); From b9040c99414ba5b85090595a61abc686a5dbb388 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Thu, 8 Aug 2019 15:45:58 +0300 Subject: [PATCH 5/6] habanalabs: fix endianness handling for internal QMAN submission The PQs of internal H/W queues (QMANs) can be located in different memory areas for different ASICs. Therefore, when writing PQEs, we need to use the correct function according to the location of the PQ. e.g. if the PQ is located in the device's memory (SRAM or DRAM), we need to use memcpy_toio() so it would work in architectures that have separate address ranges for IO memory. This patch makes the code that writes the PQE to be ASIC-specific so we can handle this properly per ASIC. Signed-off-by: Oded Gabbay Tested-by: Ben Segal --- drivers/misc/habanalabs/goya/goya.c | 7 ++++--- drivers/misc/habanalabs/goya/goyaP.h | 2 +- drivers/misc/habanalabs/habanalabs.h | 9 +++++++-- drivers/misc/habanalabs/hw_queue.c | 14 +++++--------- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index e8b1142910e0..b39b9c98fe1d 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -2729,9 +2729,10 @@ void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi) GOYA_ASYNC_EVENT_ID_PI_UPDATE); } -void goya_flush_pq_write(struct hl_device *hdev, u64 *pq, u64 exp_val) +void goya_pqe_write(struct hl_device *hdev, __le64 *pqe, struct hl_bd *bd) { - /* Not needed in Goya */ + /* The QMANs are on the SRAM so need to copy to IO space */ + memcpy_toio((void __iomem *) pqe, bd, sizeof(struct hl_bd)); } static void *goya_dma_alloc_coherent(struct hl_device *hdev, size_t size, @@ -5048,7 +5049,7 @@ static const struct hl_asic_funcs goya_funcs = { .resume = goya_resume, .cb_mmap = goya_cb_mmap, .ring_doorbell = goya_ring_doorbell, - .flush_pq_write = goya_flush_pq_write, + .pqe_write = goya_pqe_write, .asic_dma_alloc_coherent = goya_dma_alloc_coherent, .asic_dma_free_coherent = goya_dma_free_coherent, .get_int_queue_base = goya_get_int_queue_base, diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h index f8c611883dc1..d7f48c9c41cd 100644 --- a/drivers/misc/habanalabs/goya/goyaP.h +++ b/drivers/misc/habanalabs/goya/goyaP.h @@ -177,7 +177,7 @@ int goya_late_init(struct hl_device *hdev); void goya_late_fini(struct hl_device *hdev); void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi); -void goya_flush_pq_write(struct hl_device *hdev, u64 *pq, u64 exp_val); +void goya_pqe_write(struct hl_device *hdev, __le64 *pqe, struct hl_bd *bd); void goya_update_eq_ci(struct hl_device *hdev, u32 val); void goya_restore_phase_topology(struct hl_device *hdev); int goya_context_switch(struct hl_device *hdev, u32 asid); diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 6a4c64b97f38..ce83adafcf2d 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -441,7 +441,11 @@ enum hl_pll_frequency { * @resume: handles IP specific H/W or SW changes for resume. * @cb_mmap: maps a CB. * @ring_doorbell: increment PI on a given QMAN. - * @flush_pq_write: flush PQ entry write if necessary, WARN if flushing failed. + * @pqe_write: Write the PQ entry to the PQ. This is ASIC-specific + * function because the PQs are located in different memory areas + * per ASIC (SRAM, DRAM, Host memory) and therefore, the method of + * writing the PQE must match the destination memory area + * properties. * @asic_dma_alloc_coherent: Allocate coherent DMA memory by calling * dma_alloc_coherent(). This is ASIC function because * its implementation is not trivial when the driver @@ -510,7 +514,8 @@ struct hl_asic_funcs { int (*cb_mmap)(struct hl_device *hdev, struct vm_area_struct *vma, u64 kaddress, phys_addr_t paddress, u32 size); void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi); - void (*flush_pq_write)(struct hl_device *hdev, u64 *pq, u64 exp_val); + void (*pqe_write)(struct hl_device *hdev, __le64 *pqe, + struct hl_bd *bd); void* (*asic_dma_alloc_coherent)(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, gfp_t flag); void (*asic_dma_free_coherent)(struct hl_device *hdev, size_t size, diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c index e3b5517897ea..5f5673b74985 100644 --- a/drivers/misc/habanalabs/hw_queue.c +++ b/drivers/misc/habanalabs/hw_queue.c @@ -290,23 +290,19 @@ static void int_hw_queue_schedule_job(struct hl_cs_job *job) struct hl_device *hdev = job->cs->ctx->hdev; struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id]; struct hl_bd bd; - u64 *pi, *pbd = (u64 *) &bd; + __le64 *pi; bd.ctl = 0; - bd.len = __cpu_to_le32(job->job_cb_size); - bd.ptr = __cpu_to_le64((u64) (uintptr_t) job->user_cb); + bd.len = cpu_to_le32(job->job_cb_size); + bd.ptr = cpu_to_le64((u64) (uintptr_t) job->user_cb); - pi = (u64 *) (uintptr_t) (q->kernel_address + + pi = (__le64 *) (uintptr_t) (q->kernel_address + ((q->pi & (q->int_queue_len - 1)) * sizeof(bd))); - pi[0] = pbd[0]; - pi[1] = pbd[1]; - q->pi++; q->pi &= ((q->int_queue_len << 1) - 1); - /* Flush PQ entry write. Relevant only for specific ASICs */ - hdev->asic_funcs->flush_pq_write(hdev, pi, pbd[0]); + hdev->asic_funcs->pqe_write(hdev, pi, &bd); hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi); } From b421d83a3947369fd5718824aecfaebe1efbf7ed Mon Sep 17 00:00:00 2001 From: Ben Segal Date: Wed, 7 Aug 2019 13:54:54 +0000 Subject: [PATCH 6/6] habanalabs: fix device IRQ unmasking for BE host When unmasking IRQs inside the ASIC, the driver passes an array of all the IRQ to unmask. The ASIC's CPU is working in LE so when running in a BE host, the driver needs to do the proper endianness swapping when preparing this array. In addition, this patch also fixes the endianness of a couple of kernel log debug messages that print values of packets Signed-off-by: Ben Segal Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/goya/goya.c | 33 +++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index b39b9c98fe1d..271c5c8f53b4 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -3314,9 +3314,11 @@ static int goya_validate_dma_pkt_no_mmu(struct hl_device *hdev, int rc; dev_dbg(hdev->dev, "DMA packet details:\n"); - dev_dbg(hdev->dev, "source == 0x%llx\n", user_dma_pkt->src_addr); - dev_dbg(hdev->dev, "destination == 0x%llx\n", user_dma_pkt->dst_addr); - dev_dbg(hdev->dev, "size == %u\n", user_dma_pkt->tsize); + dev_dbg(hdev->dev, "source == 0x%llx\n", + le64_to_cpu(user_dma_pkt->src_addr)); + dev_dbg(hdev->dev, "destination == 0x%llx\n", + le64_to_cpu(user_dma_pkt->dst_addr)); + dev_dbg(hdev->dev, "size == %u\n", le32_to_cpu(user_dma_pkt->tsize)); ctl = le32_to_cpu(user_dma_pkt->ctl); user_dir = (ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >> @@ -3345,9 +3347,11 @@ static int goya_validate_dma_pkt_mmu(struct hl_device *hdev, struct packet_lin_dma *user_dma_pkt) { dev_dbg(hdev->dev, "DMA packet details:\n"); - dev_dbg(hdev->dev, "source == 0x%llx\n", user_dma_pkt->src_addr); - dev_dbg(hdev->dev, "destination == 0x%llx\n", user_dma_pkt->dst_addr); - dev_dbg(hdev->dev, "size == %u\n", user_dma_pkt->tsize); + dev_dbg(hdev->dev, "source == 0x%llx\n", + le64_to_cpu(user_dma_pkt->src_addr)); + dev_dbg(hdev->dev, "destination == 0x%llx\n", + le64_to_cpu(user_dma_pkt->dst_addr)); + dev_dbg(hdev->dev, "size == %u\n", le32_to_cpu(user_dma_pkt->tsize)); /* * WA for HW-23. @@ -3387,7 +3391,8 @@ static int goya_validate_wreg32(struct hl_device *hdev, dev_dbg(hdev->dev, "WREG32 packet details:\n"); dev_dbg(hdev->dev, "reg_offset == 0x%x\n", reg_offset); - dev_dbg(hdev->dev, "value == 0x%x\n", wreg_pkt->value); + dev_dbg(hdev->dev, "value == 0x%x\n", + le32_to_cpu(wreg_pkt->value)); if (reg_offset != (mmDMA_CH_0_WR_COMP_ADDR_LO & 0x1FFF)) { dev_err(hdev->dev, "WREG32 packet with illegal address 0x%x\n", @@ -4359,6 +4364,8 @@ static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr, size_t total_pkt_size; long result; int rc; + int irq_num_entries, irq_arr_index; + __le32 *goya_irq_arr; total_pkt_size = sizeof(struct armcp_unmask_irq_arr_packet) + irq_arr_size; @@ -4376,8 +4383,16 @@ static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr, if (!pkt) return -ENOMEM; - pkt->length = cpu_to_le32(irq_arr_size / sizeof(irq_arr[0])); - memcpy(&pkt->irqs, irq_arr, irq_arr_size); + irq_num_entries = irq_arr_size / sizeof(irq_arr[0]); + pkt->length = cpu_to_le32(irq_num_entries); + + /* We must perform any necessary endianness conversation on the irq + * array being passed to the goya hardware + */ + for (irq_arr_index = 0, goya_irq_arr = (__le32 *) &pkt->irqs; + irq_arr_index < irq_num_entries ; irq_arr_index++) + goya_irq_arr[irq_arr_index] = + cpu_to_le32(irq_arr[irq_arr_index]); pkt->armcp_pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY << ARMCP_PKT_CTL_OPCODE_SHIFT);