From 8d310ba845827a38fcd463d86bfe3b730ce7ab8f Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Mon, 27 May 2024 23:11:36 -0700 Subject: [PATCH 01/65] RDMA/bnxt_re: Allow MSN table capability check FW reports the HW capability to use PSN table or MSN table and the driver/library needs to select it based on this capability. Use the new capability instead of the older capability check for HW retransmission while handling the MSN/PSN table. FW reports zero (PSN table) for older adapters to maintain backward compatibility. Also, update the FW interface structures to handle the new fields. Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1716876697-25970-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 12 ++++----- drivers/infiniband/hw/bnxt_re/qplib_fp.h | 2 +- drivers/infiniband/hw/bnxt_re/qplib_res.h | 6 +++++ drivers/infiniband/hw/bnxt_re/qplib_sp.c | 1 + drivers/infiniband/hw/bnxt_re/qplib_sp.h | 1 + drivers/infiniband/hw/bnxt_re/roce_hsi.h | 30 ++++++++++++++++++++++- 6 files changed, 44 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 04258676d072..49e4a4a50bfa 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -984,7 +984,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) u16 nsge; if (res->dattr) - qp->dev_cap_flags = res->dattr->dev_cap_flags; + qp->is_host_msn_tbl = _is_host_msn_table(res->dattr->dev_cap_flags2); sq->dbinfo.flags = 0; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, @@ -1002,7 +1002,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) sizeof(struct sq_psn_search_ext) : sizeof(struct sq_psn_search); - if (BNXT_RE_HW_RETX(qp->dev_cap_flags)) { + if (qp->is_host_msn_tbl) { psn_sz = sizeof(struct sq_msn_search); qp->msn = 0; } @@ -1016,7 +1016,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.aux_depth = psn_sz ?
bnxt_qplib_set_sq_size(sq, qp->wqe_mode) : 0; /* Update msn tbl size */ - if (BNXT_RE_HW_RETX(qp->dev_cap_flags) && psn_sz) { + if (qp->is_host_msn_tbl && psn_sz) { hwq_attr.aux_depth = roundup_pow_of_two(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)); qp->msn_tbl_sz = hwq_attr.aux_depth; qp->msn = 0; @@ -1637,7 +1637,7 @@ static void bnxt_qplib_fill_psn_search(struct bnxt_qplib_qp *qp, if (!swq->psn_search) return; /* Handle MSN differently on cap flags */ - if (BNXT_RE_HW_RETX(qp->dev_cap_flags)) { + if (qp->is_host_msn_tbl) { bnxt_qplib_fill_msn_search(qp, wqe, swq); return; } @@ -1819,7 +1819,7 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, } swq = bnxt_qplib_get_swqe(sq, &wqe_idx); - bnxt_qplib_pull_psn_buff(qp, sq, swq, BNXT_RE_HW_RETX(qp->dev_cap_flags)); + bnxt_qplib_pull_psn_buff(qp, sq, swq, qp->is_host_msn_tbl); idx = 0; swq->slot_idx = hwq->prod; @@ -2009,7 +2009,7 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, rc = -EINVAL; goto done; } - if (!BNXT_RE_HW_RETX(qp->dev_cap_flags) || msn_update) { + if (!qp->is_host_msn_tbl || msn_update) { swq->next_psn = sq->psn & BTH_PSN_MASK; bnxt_qplib_fill_psn_search(qp, wqe, swq); } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index 7fd4506b3584..4aaac84c1b1b 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -340,7 +340,7 @@ struct bnxt_qplib_qp { struct list_head rq_flush; u32 msn; u32 msn_tbl_sz; - u16 dev_cap_flags; + bool is_host_msn_tbl; }; #define BNXT_QPLIB_MAX_CQE_ENTRY_SIZE sizeof(struct cq_base) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 61628f7f1253..a0f78cde314f 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -554,6 +554,12 @@ static inline bool _is_hw_retx_supported(u16 dev_cap_flags) #define BNXT_RE_HW_RETX(a) _is_hw_retx_supported((a)) +static inline bool _is_host_msn_table(u16 dev_cap_ext_flags2) +{ + return (dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_MASK) == + CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_HOST_MSN_TABLE; +} + static inline u8 bnxt_qplib_dbr_pacing_en(struct bnxt_qplib_chip_ctx *cctx) { return cctx->modes.dbr_pacing; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 8beeedd15061..9328db92fa6d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -156,6 +156,7 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, (0x01 << RCFW_DBR_BASE_PAGE_SHIFT); attr->max_sgid = BNXT_QPLIB_NUM_GIDS_SUPPORTED; attr->dev_cap_flags = le16_to_cpu(sb->dev_cap_flags); + attr->dev_cap_flags2 = le16_to_cpu(sb->dev_cap_ext_flags_2); bnxt_qplib_query_version(rcfw, attr->fw_ver); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index d33c78b96217..16a67d70a6fc 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -72,6 +72,7 @@ struct bnxt_qplib_dev_attr { u8 tqm_alloc_reqs[MAX_TQM_ALLOC_REQ]; bool is_atomic; u16 dev_cap_flags; + u16 dev_cap_flags2; u32 max_dpi; }; diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index 605c9463c408..042530969505 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -2157,8 +2157,36 @@ struct creq_query_func_resp_sb { __le32 
tqm_alloc_reqs[12]; __le32 max_dpi; u8 max_sge_var_wqe; - u8 reserved_8; + u8 dev_cap_ext_flags; + #define CREQ_QUERY_FUNC_RESP_SB_ATOMIC_OPS_NOT_SUPPORTED 0x1UL + #define CREQ_QUERY_FUNC_RESP_SB_DRV_VERSION_RGTR_SUPPORTED 0x2UL + #define CREQ_QUERY_FUNC_RESP_SB_CREATE_QP_BATCH_SUPPORTED 0x4UL + #define CREQ_QUERY_FUNC_RESP_SB_DESTROY_QP_BATCH_SUPPORTED 0x8UL + #define CREQ_QUERY_FUNC_RESP_SB_ROCE_STATS_EXT_CTX_SUPPORTED 0x10UL + #define CREQ_QUERY_FUNC_RESP_SB_CREATE_SRQ_SGE_SUPPORTED 0x20UL + #define CREQ_QUERY_FUNC_RESP_SB_FIXED_SIZE_WQE_DISABLED 0x40UL + #define CREQ_QUERY_FUNC_RESP_SB_DCN_SUPPORTED 0x80UL __le16 max_inline_data_var_wqe; + __le32 start_qid; + u8 max_msn_table_size; + u8 reserved8_1; + __le16 dev_cap_ext_flags_2; + #define CREQ_QUERY_FUNC_RESP_SB_OPTIMIZE_MODIFY_QP_SUPPORTED 0x1UL + #define CREQ_QUERY_FUNC_RESP_SB_CHANGE_UDP_SRC_PORT_WQE_SUPPORTED 0x2UL + #define CREQ_QUERY_FUNC_RESP_SB_CQ_COALESCING_SUPPORTED 0x4UL + #define CREQ_QUERY_FUNC_RESP_SB_MEMORY_REGION_RO_SUPPORTED 0x8UL + #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_MASK 0x30UL + #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_SFT 4 + #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_HOST_PSN_TABLE (0x0UL << 4) + #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_HOST_MSN_TABLE (0x1UL << 4) + #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE (0x2UL << 4) + #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_LAST \ CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE + __le16 max_xp_qp_size; + __le16 create_qp_batch_size; + __le16 destroy_qp_batch_size; + __le16 reserved16; + __le64 reserved64; }; /* cmdq_set_func_resources (size:448b/56B) */ From 6f6bfbc595fbae95f4c1ff80c87d089604e5e6a1 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Mon, 27 May 2024 23:11:37 -0700 Subject: [PATCH 02/65] RDMA/bnxt_re: Expose the MSN table capability for user library BNXT_RE_COMP_MASK_UCNTX_HW_RETX_ENABLED was introduced to share the HW retransmit capability between driver and lib. The main difference in implementation for HW Retransmit support is the usage of the MSN table or the PSN table. When HW retrans is enabled, HW expects the MSN table to be allocated by the driver/lib, else the PSN table (for older adapters). FW exposes a new field which gives the MSN capability. Drivers and libs can depend on the new field instead of the HW Retrans capability. For adapters which support the HW_RETX feature, the MSN table capability will be set. For older adapters, this value will be 0 (to maintain backward compatibility with older FW). Rename the UAPI just to capture the correct name of the HW capability that the driver/library is interested in. No functional impact even if older rdma-core is used.
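For illustration, this is roughly how a user library is expected to consume the new bit (a sketch assuming an rdma-core-style provider; the resp/cntx names here are hypothetical):

    /* resp is the udata response copied back by alloc_ucontext */
    if (resp.comp_mask & BNXT_RE_UCNTX_CMASK_MSN_TABLE_ENABLED)
        cntx->use_msn_tbl = true;   /* allocate an MSN table for SQs */
    else
        cntx->use_msn_tbl = false;  /* older adapter: fall back to PSN table */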
Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1716876697-25970-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 3 +++ include/uapi/rdma/bnxt_re-abi.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index ce9c5bae83bf..d261b09025ca 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -4201,6 +4201,9 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) if (rdev->pacing.dbr_pacing) resp.comp_mask |= BNXT_RE_UCNTX_CMASK_DBR_PACING_ENABLED; + if (_is_host_msn_table(rdev->qplib_res.dattr->dev_cap_flags2)) + resp.comp_mask |= BNXT_RE_UCNTX_CMASK_MSN_TABLE_ENABLED; + if (udata->inlen >= sizeof(ureq)) { rc = ib_copy_from_udata(&ureq, udata, min(udata->inlen, sizeof(ureq))); if (rc) diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h index c0c34aca90ec..e61104f35d73 100644 --- a/include/uapi/rdma/bnxt_re-abi.h +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -55,7 +55,7 @@ enum { BNXT_RE_UCNTX_CMASK_WC_DPI_ENABLED = 0x04ULL, BNXT_RE_UCNTX_CMASK_DBR_PACING_ENABLED = 0x08ULL, BNXT_RE_UCNTX_CMASK_POW2_DISABLED = 0x10ULL, - BNXT_RE_COMP_MASK_UCNTX_HW_RETX_ENABLED = 0x40, + BNXT_RE_UCNTX_CMASK_MSN_TABLE_ENABLED = 0x40, }; enum bnxt_re_wqe_mode { From 435cdbe9f7a879209807a7519512f94f0b4367c3 Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Mon, 13 May 2024 08:10:19 +0000 Subject: [PATCH 03/65] RDMA/efa: Fail probe on missing BARs In case any of the PCI BARs is missing during device probe, we would like to fail as early as possible. Fail if any of the required BARs isn't listed as a memory BAR. Reviewed-by: Daniel Kranzdorf Reviewed-by: Firas Jahjah Signed-off-by: Michael Margolin Link: https://lore.kernel.org/r/20240513081019.26998-1-mrgolin@amazon.com Reviewed-by: Gal Pressman Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/efa/efa_main.c | 30 ++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index d1a48f988f6c..99f9ac23c721 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -190,15 +190,23 @@ static int efa_request_doorbell_bar(struct efa_dev *dev) { u8 db_bar_idx = dev->dev_attr.db_bar; struct pci_dev *pdev = dev->pdev; - int bars; + int pci_mem_bars; + int db_bar; int err; - if (!(BIT(db_bar_idx) & EFA_BASE_BAR_MASK)) { - bars = pci_select_bars(pdev, IORESOURCE_MEM) & BIT(db_bar_idx); + db_bar = BIT(db_bar_idx); + if (!(db_bar & EFA_BASE_BAR_MASK)) { + pci_mem_bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (db_bar & ~pci_mem_bars) { + dev_err(&pdev->dev, + "Doorbells BAR unavailable.
Requested %#x, available %#x\n", + db_bar, pci_mem_bars); + return -ENODEV; + } - err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + err = pci_request_selected_regions(pdev, db_bar, DRV_MODULE_NAME); if (err) { - dev_err(&dev->pdev->dev, + dev_err(&pdev->dev, "pci_request_selected_regions for bar %d failed %d\n", db_bar_idx, err); return err; @@ -531,7 +539,7 @@ static struct efa_dev *efa_probe_device(struct pci_dev *pdev) { struct efa_com_dev *edev; struct efa_dev *dev; - int bars; + int pci_mem_bars; int err; err = pci_enable_device_mem(pdev); @@ -556,8 +564,14 @@ static struct efa_dev *efa_probe_device(struct pci_dev *pdev) dev->pdev = pdev; xa_init(&dev->cqs_xa); - bars = pci_select_bars(pdev, IORESOURCE_MEM) & EFA_BASE_BAR_MASK; - err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + pci_mem_bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (EFA_BASE_BAR_MASK & ~pci_mem_bars) { + dev_err(&pdev->dev, "BARs unavailable. Requested %#x, available %#x\n", + (int)EFA_BASE_BAR_MASK, pci_mem_bars); + err = -ENODEV; + goto err_ibdev_destroy; + } + err = pci_request_selected_regions(pdev, EFA_BASE_BAR_MASK, DRV_MODULE_NAME); if (err) { dev_err(&pdev->dev, "pci_request_selected_regions failed %d\n", err); From 38c02d813aa321c0e79b9887da6a2e2b57a75698 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 19 May 2024 09:02:15 +0200 Subject: [PATCH 04/65] RDMA/irdma: Annotate flexible array with __counted_by() in struct irdma_qvlist_info 'num_vectors' is used to count the number of elements in the 'qv_info' flexible array in "struct irdma_qvlist_info". So annotate it with __counted_by() to make it explicit and enable some additional checks. This allocation is done in irdma_save_msix_info(). Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/2ca8b14adf79c4795d7aa95bbfc79253a6bfed82.1716102112.git.christophe.jaillet@wanadoo.fr Acked-by: Shiraz Saleem Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index b65bc2ea542f..9f0ed6e84471 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -239,7 +239,7 @@ struct irdma_qv_info { struct irdma_qvlist_info { u32 num_vectors; - struct irdma_qv_info qv_info[]; + struct irdma_qv_info qv_info[] __counted_by(num_vectors); }; struct irdma_gen_ops { From 53657a0419ef443f9d17ee035ca9980572736d0a Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Wed, 22 May 2024 01:24:00 -0700 Subject: [PATCH 05/65] RDMA/mana_ib: Create and destroy RC QP Implement HW requests to create and destroy an RC QP. An RC QP may have 5 queues. 
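The five queue types, as introduced by the new mana_rc_queue_type enum in this patch:

    MANA_RC_SEND_QUEUE_REQUESTER,   /* requester send queue */
    MANA_RC_SEND_QUEUE_RESPONDER,   /* responder send queue */
    MANA_RC_SEND_QUEUE_FMR,         /* FMR queue */
    MANA_RC_RECV_QUEUE_REQUESTER,   /* requester receive queue */
    MANA_RC_RECV_QUEUE_RESPONDER,   /* responder receive queue */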
Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1716366242-558-2-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/main.c | 59 ++++++++++++++++++++++++++++ drivers/infiniband/hw/mana/mana_ib.h | 58 ++++++++++++++++++++++++++- 2 files changed, 116 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 2a411357640e..6bd60729ea32 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -888,3 +888,62 @@ int mana_ib_gd_destroy_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) return 0; } + +int mana_ib_gd_create_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp, + struct ib_qp_init_attr *attr, u32 doorbell, u64 flags) +{ + struct mana_ib_cq *send_cq = container_of(qp->ibqp.send_cq, struct mana_ib_cq, ibcq); + struct mana_ib_cq *recv_cq = container_of(qp->ibqp.recv_cq, struct mana_ib_cq, ibcq); + struct mana_ib_pd *pd = container_of(qp->ibqp.pd, struct mana_ib_pd, ibpd); + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_rnic_create_qp_resp resp = {}; + struct mana_rnic_create_qp_req req = {}; + int err, i; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_RC_QP, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.pd_handle = pd->pd_handle; + req.send_cq_handle = send_cq->cq_handle; + req.recv_cq_handle = recv_cq->cq_handle; + for (i = 0; i < MANA_RC_QUEUE_TYPE_MAX; i++) + req.dma_region[i] = qp->rc_qp.queues[i].gdma_region; + req.doorbell_page = doorbell; + req.max_send_wr = attr->cap.max_send_wr; + req.max_recv_wr = attr->cap.max_recv_wr; + req.max_send_sge = attr->cap.max_send_sge; + req.max_recv_sge = attr->cap.max_recv_sge; + req.flags = flags; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create rc qp err %d", err); + return err; + } + qp->qp_handle = resp.rc_qp_handle; + for (i = 0; i < MANA_RC_QUEUE_TYPE_MAX; i++) { + qp->rc_qp.queues[i].id = resp.queue_ids[i]; + /* The GDMA regions are now owned by the RNIC QP handle */ + qp->rc_qp.queues[i].gdma_region = GDMA_INVALID_DMA_REGION; + } + return 0; +} + +int mana_ib_gd_destroy_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + struct mana_rnic_destroy_rc_qp_resp resp = {0}; + struct mana_rnic_destroy_rc_qp_req req = {0}; + struct gdma_context *gc = mdev_to_gc(mdev); + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_RC_QP, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.rc_qp_handle = qp->qp_handle; + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to destroy rc qp err %d", err); + return err; + } + return 0; +} diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 68c3b4f0faa4..a3e229c83c5c 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -95,11 +95,27 @@ struct mana_ib_cq { mana_handle_t cq_handle; }; +enum mana_rc_queue_type { + MANA_RC_SEND_QUEUE_REQUESTER = 0, + MANA_RC_SEND_QUEUE_RESPONDER, + MANA_RC_SEND_QUEUE_FMR, + MANA_RC_RECV_QUEUE_REQUESTER, + MANA_RC_RECV_QUEUE_RESPONDER, + MANA_RC_QUEUE_TYPE_MAX, +}; + +struct mana_ib_rc_qp { + struct mana_ib_queue queues[MANA_RC_QUEUE_TYPE_MAX]; +}; + struct mana_ib_qp { struct ib_qp 
ibqp; mana_handle_t qp_handle; - struct mana_ib_queue raw_sq; + union { + struct mana_ib_queue raw_sq; + struct mana_ib_rc_qp rc_qp; + }; /* The port on the IB device, starting with 1 */ u32 port; @@ -122,6 +138,8 @@ enum mana_ib_command_code { MANA_IB_CONFIG_MAC_ADDR = 0x30005, MANA_IB_CREATE_CQ = 0x30008, MANA_IB_DESTROY_CQ = 0x30009, + MANA_IB_CREATE_RC_QP = 0x3000a, + MANA_IB_DESTROY_RC_QP = 0x3000b, }; struct mana_ib_query_adapter_caps_req { @@ -230,6 +248,40 @@ struct mana_rnic_destroy_cq_resp { struct gdma_resp_hdr hdr; }; /* HW Data */ +struct mana_rnic_create_qp_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + mana_handle_t pd_handle; + mana_handle_t send_cq_handle; + mana_handle_t recv_cq_handle; + u64 dma_region[MANA_RC_QUEUE_TYPE_MAX]; + u64 deprecated[2]; + u64 flags; + u32 doorbell_page; + u32 max_send_wr; + u32 max_recv_wr; + u32 max_send_sge; + u32 max_recv_sge; + u32 reserved; +}; /* HW Data */ + +struct mana_rnic_create_qp_resp { + struct gdma_resp_hdr hdr; + mana_handle_t rc_qp_handle; + u32 queue_ids[MANA_RC_QUEUE_TYPE_MAX]; + u32 reserved; +}; /* HW Data*/ + +struct mana_rnic_destroy_rc_qp_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + mana_handle_t rc_qp_handle; +}; /* HW Data */ + +struct mana_rnic_destroy_rc_qp_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + static inline struct gdma_context *mdev_to_gc(struct mana_ib_dev *mdev) { return mdev->gdma_dev->gdma_context; @@ -354,4 +406,8 @@ int mana_ib_gd_config_mac(struct mana_ib_dev *mdev, enum mana_ib_addr_op op, u8 int mana_ib_gd_create_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq, u32 doorbell); int mana_ib_gd_destroy_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq); + +int mana_ib_gd_create_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp, + struct ib_qp_init_attr *attr, u32 doorbell, u64 flags); +int mana_ib_gd_destroy_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp); #endif From fdefb918496235a11d6c5477c34c81aab2c1343b Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Wed, 22 May 2024 01:24:01 -0700 Subject: [PATCH 06/65] RDMA/mana_ib: Implement uapi to create and destroy RC QP Implement user requests to create and destroy an RC QP. As the user does not have an FMR queue, it is skipped and NO_FMR flag is used. 
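For illustration, a sketch of the matching userspace request (assuming an rdma-core-style provider; queue_mem[] and queue_bytes[] are hypothetical placeholders for the four user-allocated queue buffers):

    struct mana_ib_create_rc_qp cmd = {};
    int i;

    /* Only four queues from userspace: the FMR queue is kernel-owned
     * and skipped, hence the NO_FMR flag on the kernel side. */
    for (i = 0; i < 4; i++) {
        cmd.queue_buf[i] = (__u64)(uintptr_t)queue_mem[i];
        cmd.queue_size[i] = queue_bytes[i];
    }
    /* cmd is then passed as the driver-private payload of create_qp */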
Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1716366242-558-3-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/mana_ib.h | 4 ++ drivers/infiniband/hw/mana/qp.c | 94 +++++++++++++++++++++++++++- include/uapi/rdma/mana-abi.h | 9 +++ 3 files changed, 105 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index a3e229c83c5c..5cccbe397b6d 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -248,6 +248,10 @@ struct mana_rnic_destroy_cq_resp { struct gdma_resp_hdr hdr; }; /* HW Data */ +enum mana_rnic_create_rc_flags { + MANA_RC_FLAG_NO_FMR = 2, +}; + struct mana_rnic_create_qp_req { struct gdma_req_hdr hdr; mana_handle_t adapter; diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index ba13c5abf8ef..7dbeba41b978 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -398,6 +398,78 @@ err_free_vport: return err; } +static int mana_ib_create_rc_qp(struct ib_qp *ibqp, struct ib_pd *ibpd, + struct ib_qp_init_attr *attr, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = container_of(ibpd->device, struct mana_ib_dev, ib_dev); + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + struct mana_ib_create_rc_qp_resp resp = {}; + struct mana_ib_ucontext *mana_ucontext; + struct mana_ib_create_rc_qp ucmd = {}; + int i, err, j; + u64 flags = 0; + u32 doorbell; + + if (!udata || udata->inlen < sizeof(ucmd)) + return -EINVAL; + + mana_ucontext = rdma_udata_to_drv_context(udata, struct mana_ib_ucontext, ibucontext); + doorbell = mana_ucontext->doorbell; + flags = MANA_RC_FLAG_NO_FMR; + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, "Failed to copy from udata, %d\n", err); + return err; + } + + for (i = 0, j = 0; i < MANA_RC_QUEUE_TYPE_MAX; ++i) { + /* skip FMR for user-level RC QPs */ + if (i == MANA_RC_SEND_QUEUE_FMR) { + qp->rc_qp.queues[i].id = INVALID_QUEUE_ID; + qp->rc_qp.queues[i].gdma_region = GDMA_INVALID_DMA_REGION; + continue; + } + err = mana_ib_create_queue(mdev, ucmd.queue_buf[j], ucmd.queue_size[j], + &qp->rc_qp.queues[i]); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create queue %d, err %d\n", i, err); + goto destroy_queues; + } + j++; + } + + err = mana_ib_gd_create_rc_qp(mdev, qp, attr, doorbell, flags); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create rc qp %d\n", err); + goto destroy_queues; + } + qp->ibqp.qp_num = qp->rc_qp.queues[MANA_RC_RECV_QUEUE_RESPONDER].id; + qp->port = attr->port_num; + + if (udata) { + for (i = 0, j = 0; i < MANA_RC_QUEUE_TYPE_MAX; ++i) { + if (i == MANA_RC_SEND_QUEUE_FMR) + continue; + resp.queue_id[j] = qp->rc_qp.queues[i].id; + j++; + } + err = ib_copy_to_udata(udata, &resp, min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, "Failed to copy to udata, %d\n", err); + goto destroy_qp; + } + } + + return 0; + +destroy_qp: + mana_ib_gd_destroy_rc_qp(mdev, qp); +destroy_queues: + while (i-- > 0) + mana_ib_destroy_queue(mdev, &qp->rc_qp.queues[i]); + return err; +} + int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, struct ib_udata *udata) { @@ -409,8 +481,9 @@ int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, udata); return mana_ib_create_qp_raw(ibqp, ibqp->pd, attr, udata); + case IB_QPT_RC: + return mana_ib_create_rc_qp(ibqp, 
ibqp->pd, attr, udata); default: - /* Creating QP other than IB_QPT_RAW_PACKET is not supported */ ibdev_dbg(ibqp->device, "Creating QP type %u not supported\n", attr->qp_type); } @@ -473,6 +546,22 @@ static int mana_ib_destroy_qp_raw(struct mana_ib_qp *qp, struct ib_udata *udata) return 0; } +static int mana_ib_destroy_rc_qp(struct mana_ib_qp *qp, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = + container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + int i; + + /* Ignore return code as there is not much we can do about it. + * The error message is printed inside. + */ + mana_ib_gd_destroy_rc_qp(mdev, qp); + for (i = 0; i < MANA_RC_QUEUE_TYPE_MAX; ++i) + mana_ib_destroy_queue(mdev, &qp->rc_qp.queues[i]); + + return 0; +} + int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) { struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); @@ -484,7 +573,8 @@ int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) udata); return mana_ib_destroy_qp_raw(qp, udata); - + case IB_QPT_RC: + return mana_ib_destroy_rc_qp(qp, udata); default: ibdev_dbg(ibqp->device, "Unexpected QP type %u\n", ibqp->qp_type); diff --git a/include/uapi/rdma/mana-abi.h b/include/uapi/rdma/mana-abi.h index 2c41cc315218..45c2df619f07 100644 --- a/include/uapi/rdma/mana-abi.h +++ b/include/uapi/rdma/mana-abi.h @@ -45,6 +45,15 @@ struct mana_ib_create_qp_resp { __u32 reserved; }; +struct mana_ib_create_rc_qp { + __aligned_u64 queue_buf[4]; + __u32 queue_size[4]; +}; + +struct mana_ib_create_rc_qp_resp { + __u32 queue_id[4]; +}; + struct mana_ib_create_wq { __aligned_u64 wq_buf_addr; __u32 wq_buf_size; From e095405b45bbbdcf521c63b4207071e5b32df671 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Wed, 22 May 2024 01:24:02 -0700 Subject: [PATCH 07/65] RDMA/mana_ib: Modify QP state Implement modify QP state for RC QPs. 
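For context, this path is reached through the standard verbs state machine; a sketch of one transition (most attribute fields elided):

    struct ib_qp_attr attr = { .qp_state = IB_QPS_RTR };

    /* RESET -> INIT -> RTR -> RTS; each step now lands in
     * mana_ib_gd_modify_qp() for RC QPs. */
    ret = ib_modify_qp(ibqp, &attr,
                       IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU |
                       IB_QP_DEST_QPN | IB_QP_RQ_PSN |
                       IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER);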
Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1716366242-558-4-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/mana_ib.h | 37 ++++++++++++++ drivers/infiniband/hw/mana/qp.c | 72 +++++++++++++++++++++++++++- 2 files changed, 107 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 5cccbe397b6d..d29dee7b5c7f 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -140,6 +140,7 @@ enum mana_ib_command_code { MANA_IB_DESTROY_CQ = 0x30009, MANA_IB_CREATE_RC_QP = 0x3000a, MANA_IB_DESTROY_RC_QP = 0x3000b, + MANA_IB_SET_QP_STATE = 0x3000d, }; struct mana_ib_query_adapter_caps_req { @@ -286,6 +287,42 @@ struct mana_rnic_destroy_rc_qp_resp { struct gdma_resp_hdr hdr; }; /* HW Data */ +struct mana_ib_ah_attr { + u8 src_addr[16]; + u8 dest_addr[16]; + u8 src_mac[ETH_ALEN]; + u8 dest_mac[ETH_ALEN]; + u8 src_addr_type; + u8 dest_addr_type; + u8 hop_limit; + u8 traffic_class; + u16 src_port; + u16 dest_port; + u32 reserved; +}; + +struct mana_rnic_set_qp_state_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + mana_handle_t qp_handle; + u64 attr_mask; + u32 qp_state; + u32 path_mtu; + u32 rq_psn; + u32 sq_psn; + u32 dest_qpn; + u32 max_dest_rd_atomic; + u32 retry_cnt; + u32 rnr_retry; + u32 min_rnr_timer; + u32 reserved; + struct mana_ib_ah_attr ah_attr; +}; /* HW Data */ + +struct mana_rnic_set_qp_state_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + static inline struct gdma_context *mdev_to_gc(struct mana_ib_dev *mdev) { return mdev->gdma_dev->gdma_context; diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 7dbeba41b978..34a93729db52 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -491,11 +491,79 @@ int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, return -EINVAL; } +static int mana_ib_gd_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = container_of(ibqp->device, struct mana_ib_dev, ib_dev); + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + struct mana_rnic_set_qp_state_resp resp = {}; + struct mana_rnic_set_qp_state_req req = {}; + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_port_context *mpc; + struct net_device *ndev; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_SET_QP_STATE, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.qp_handle = qp->qp_handle; + req.qp_state = attr->qp_state; + req.attr_mask = attr_mask; + req.path_mtu = attr->path_mtu; + req.rq_psn = attr->rq_psn; + req.sq_psn = attr->sq_psn; + req.dest_qpn = attr->dest_qp_num; + req.max_dest_rd_atomic = attr->max_dest_rd_atomic; + req.retry_cnt = attr->retry_cnt; + req.rnr_retry = attr->rnr_retry; + req.min_rnr_timer = attr->min_rnr_timer; + if (attr_mask & IB_QP_AV) { + ndev = mana_ib_get_netdev(&mdev->ib_dev, ibqp->port); + if (!ndev) { + ibdev_dbg(&mdev->ib_dev, "Invalid port %u in QP %u\n", + ibqp->port, ibqp->qp_num); + return -EINVAL; + } + mpc = netdev_priv(ndev); + copy_in_reverse(req.ah_attr.src_mac, mpc->mac_addr, ETH_ALEN); + copy_in_reverse(req.ah_attr.dest_mac, attr->ah_attr.roce.dmac, ETH_ALEN); + copy_in_reverse(req.ah_attr.src_addr, attr->ah_attr.grh.sgid_attr->gid.raw, + sizeof(union ib_gid)); + 
copy_in_reverse(req.ah_attr.dest_addr, attr->ah_attr.grh.dgid.raw, + sizeof(union ib_gid)); + if (rdma_gid_attr_network_type(attr->ah_attr.grh.sgid_attr) == RDMA_NETWORK_IPV4) { + req.ah_attr.src_addr_type = SGID_TYPE_IPV4; + req.ah_attr.dest_addr_type = SGID_TYPE_IPV4; + } else { + req.ah_attr.src_addr_type = SGID_TYPE_IPV6; + req.ah_attr.dest_addr_type = SGID_TYPE_IPV6; + } + req.ah_attr.dest_port = ROCE_V2_UDP_DPORT; + req.ah_attr.src_port = rdma_get_udp_sport(attr->ah_attr.grh.flow_label, + ibqp->qp_num, attr->dest_qp_num); + req.ah_attr.traffic_class = attr->ah_attr.grh.traffic_class; + req.ah_attr.hop_limit = attr->ah_attr.grh.hop_limit; + } + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed modify qp err %d", err); + return err; + } + + return 0; +} + int mana_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { - /* modify_qp is not supported by this version of the driver */ - return -EOPNOTSUPP; + switch (ibqp->qp_type) { + case IB_QPT_RC: + return mana_ib_gd_modify_qp(ibqp, attr, attr_mask, udata); + default: + ibdev_dbg(ibqp->device, "Modify QP type %u not supported", ibqp->qp_type); + return -EOPNOTSUPP; + } } static int mana_ib_destroy_qp_rss(struct mana_ib_qp *qp, From 2d0e7ba468eae365f3c4bc9266679e1f8dd405f0 Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Mon, 13 May 2024 06:46:30 +0000 Subject: [PATCH 08/65] RDMA/efa: Properly handle unexpected AQ completions Do not try to handle admin command completion if it has an unexpected command id and print a relevant error message. Reviewed-by: Firas Jahjah Reviewed-by: Yehuda Yitschak Signed-off-by: Michael Margolin Link: https://lore.kernel.org/r/20240513064630.6247-1-mrgolin@amazon.com Reviewed-by: Gal Pressman Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/efa/efa_com.c | 30 ++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_com.c b/drivers/infiniband/hw/efa/efa_com.c index 16a24a05fc2a..bafd210dd43e 100644 --- a/drivers/infiniband/hw/efa/efa_com.c +++ b/drivers/infiniband/hw/efa/efa_com.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "efa_com.h" @@ -406,8 +406,8 @@ static struct efa_comp_ctx *efa_com_submit_admin_cmd(struct efa_com_admin_queue return comp_ctx; } -static void efa_com_handle_single_admin_completion(struct efa_com_admin_queue *aq, - struct efa_admin_acq_entry *cqe) +static int efa_com_handle_single_admin_completion(struct efa_com_admin_queue *aq, + struct efa_admin_acq_entry *cqe) { struct efa_comp_ctx *comp_ctx; u16 cmd_id; @@ -416,11 +416,11 @@ static void efa_com_handle_single_admin_completion(struct efa_com_admin_queue *a EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID); comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, false); - if (!comp_ctx) { + if (comp_ctx->status != EFA_CMD_SUBMITTED) { ibdev_err(aq->efa_dev, - "comp_ctx is NULL. 
Changing the admin queue running state\n"); - clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); - return; + "Received completion with unexpected command id[%d], sq producer: %d, sq consumer: %d, cq consumer: %d\n", + cmd_id, aq->sq.pc, aq->sq.cc, aq->cq.cc); + return -EINVAL; } comp_ctx->status = EFA_CMD_COMPLETED; @@ -428,14 +428,17 @@ static void efa_com_handle_single_admin_completion(struct efa_com_admin_queue *a if (!test_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state)) complete(&comp_ctx->wait_event); + + return 0; } static void efa_com_handle_admin_completion(struct efa_com_admin_queue *aq) { struct efa_admin_acq_entry *cqe; u16 queue_size_mask; - u16 comp_num = 0; + u16 comp_cmds = 0; u8 phase; + int err; u16 ci; queue_size_mask = aq->depth - 1; @@ -453,10 +456,12 @@ static void efa_com_handle_admin_completion(struct efa_com_admin_queue *aq) * phase bit was validated */ dma_rmb(); - efa_com_handle_single_admin_completion(aq, cqe); + err = efa_com_handle_single_admin_completion(aq, cqe); + if (!err) + comp_cmds++; + aq->cq.cc++; ci++; - comp_num++; if (ci == aq->depth) { ci = 0; phase = !phase; @@ -465,10 +470,9 @@ static void efa_com_handle_admin_completion(struct efa_com_admin_queue *aq) cqe = &aq->cq.entries[ci]; } - aq->cq.cc += comp_num; aq->cq.phase = phase; - aq->sq.cc += comp_num; - atomic64_add(comp_num, &aq->stats.completed_cmd); + aq->sq.cc += comp_cmds; + atomic64_add(comp_cmds, &aq->stats.completed_cmd); } static int efa_com_comp_status_to_errno(u8 comp_status) From 65357e2c164a08bf20849dd55f46aa71e00334fa Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Thu, 30 May 2024 04:55:16 -0700 Subject: [PATCH 09/65] RDMA/mana_ib: set node_guid Use the mac address for the node_guid of the IB device. Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1717070117-1234-2-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 7e09ceb3da53..9a7da2ec9cdb 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -5,6 +5,7 @@ #include "mana_ib.h" #include +#include MODULE_DESCRIPTION("Microsoft Azure Network Adapter IB driver"); MODULE_LICENSE("GPL"); @@ -92,6 +93,7 @@ static int mana_ib_probe(struct auxiliary_device *adev, goto free_ib_device; } ether_addr_copy(mac_addr, upper_ndev->dev_addr); + addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, upper_ndev->dev_addr); ret = ib_device_set_netdev(&dev->ib_dev, upper_ndev, 1); rcu_read_unlock(); if (ret) { From c8683b995d8aba9d5b4e2368fedad83508882d84 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Thu, 30 May 2024 04:55:17 -0700 Subject: [PATCH 10/65] RDMA/mana_ib: extend query device Fill in properties of the ib device. Order the assignment in the order of fields in the struct ib_device_attr. 
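No new API is involved; the values simply become visible through the existing query path, e.g. from userspace:

    struct ibv_device_attr dattr;

    if (!ibv_query_device(ctx, &dattr))
        printf("max_qp_rd_atom=%d\n", dattr.max_qp_rd_atom);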
Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1717070117-1234-3-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/main.c | 19 ++++++++++++++++--- drivers/infiniband/hw/mana/mana_ib.h | 5 +++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 6bd60729ea32..365b4f174a83 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -547,14 +547,27 @@ int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct mana_ib_dev *dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + memset(props, 0, sizeof(*props)); + props->max_mr_size = MANA_IB_MAX_MR_SIZE; + props->page_size_cap = PAGE_SZ_BM; props->max_qp = dev->adapter_caps.max_qp_count; props->max_qp_wr = dev->adapter_caps.max_qp_wr; + props->device_cap_flags = IB_DEVICE_RC_RNR_NAK_GEN; + props->max_send_sge = dev->adapter_caps.max_send_sge_count; + props->max_recv_sge = dev->adapter_caps.max_recv_sge_count; + props->max_sge_rd = dev->adapter_caps.max_recv_sge_count; props->max_cq = dev->adapter_caps.max_cq_count; props->max_cqe = dev->adapter_caps.max_qp_wr; props->max_mr = dev->adapter_caps.max_mr_count; - props->max_mr_size = MANA_IB_MAX_MR_SIZE; - props->max_send_sge = dev->adapter_caps.max_send_sge_count; - props->max_recv_sge = dev->adapter_caps.max_recv_sge_count; + props->max_pd = dev->adapter_caps.max_pd_count; + props->max_qp_rd_atom = dev->adapter_caps.max_inbound_read_limit; + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_qp_init_rd_atom = dev->adapter_caps.max_outbound_read_limit; + props->atomic_cap = IB_ATOMIC_NONE; + props->masked_atomic_cap = IB_ATOMIC_NONE; + props->max_ah = INT_MAX; + props->max_pkeys = 1; + props->local_ca_ack_delay = MANA_CA_ACK_DELAY; return 0; } diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index d29dee7b5c7f..60bc548ba4be 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -27,6 +27,11 @@ */ #define MANA_IB_MAX_MR 0xFFFFFFu +/* + * The CA timeout is approx. 260ms (4us * 2^(DELAY)) + */ +#define MANA_CA_ACK_DELAY 16 + struct mana_ib_adapter_caps { u32 max_sq_id; u32 max_rq_id; From b1bc15f8fb5f20b4199fbf10868bd416d9a6a692 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 5 Jun 2024 08:50:57 -0600 Subject: [PATCH 11/65] RDMA/iwcm: Use list_first_entry() where appropriate Improve source code readability by using list_first_entry() where appropriate. 
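For reference, the helper is defined in include/linux/list.h and is exactly equivalent to the open-coded form it replaces:

    #define list_first_entry(ptr, type, member) \
        list_entry((ptr)->next, type, member)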
Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240605145117.397751-2-bvanassche@acm.org Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/iwcm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 0301fcad4b48..90d8f3d66990 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -143,8 +143,8 @@ static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv) if (list_empty(&cm_id_priv->work_free_list)) return NULL; - work = list_entry(cm_id_priv->work_free_list.next, struct iwcm_work, - free_list); + work = list_first_entry(&cm_id_priv->work_free_list, struct iwcm_work, + free_list); list_del_init(&work->free_list); return work; } @@ -1023,8 +1023,8 @@ static void cm_work_handler(struct work_struct *_work) spin_lock_irqsave(&cm_id_priv->lock, flags); empty = list_empty(&cm_id_priv->work_list); while (!empty) { - work = list_entry(cm_id_priv->work_list.next, - struct iwcm_work, list); + work = list_first_entry(&cm_id_priv->work_list, + struct iwcm_work, list); list_del_init(&work->list); empty = list_empty(&cm_id_priv->work_list); levent = work->event; From fc772e38bce5635bc1d7efae7198a305504ad112 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 5 Jun 2024 08:50:58 -0600 Subject: [PATCH 12/65] RDMA/iwcm: Change the return type of iwcm_deref_id() Since iwcm_deref_id() returns either 0 or 1, change its return type from 'int' into 'bool'. Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240605145117.397751-3-bvanassche@acm.org Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/iwcm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 90d8f3d66990..ae9c12409f8a 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -206,17 +206,17 @@ static void free_cm_id(struct iwcm_id_private *cm_id_priv) /* * Release a reference on cm_id. If the last reference is being - * released, free the cm_id and return 1. + * released, free the cm_id and return 'true'. */ -static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv) +static bool iwcm_deref_id(struct iwcm_id_private *cm_id_priv) { if (refcount_dec_and_test(&cm_id_priv->refcount)) { BUG_ON(!list_empty(&cm_id_priv->work_list)); free_cm_id(cm_id_priv); - return 1; + return true; } - return 0; + return false; } static void add_ref(struct iw_cm_id *cm_id) From e1168f09b3314992f1c5251f3793102035da7237 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 5 Jun 2024 08:50:59 -0600 Subject: [PATCH 13/65] RDMA/iwcm: Simplify cm_event_handler() queue_work() can test efficiently whether or not work is pending. Hence, simplify cm_event_handler() by always calling queue_work() instead of only if the list with pending work is empty. 
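This is safe because queue_work() already rejects duplicates; simplified from kernel/workqueue.c:

    /* Enqueue only if the work item is not already pending;
     * otherwise this is a cheap no-op that returns false. */
    if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
        __queue_work(cpu, wq, work);
        ret = true;
    }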
Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240605145117.397751-4-bvanassche@acm.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/iwcm.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index ae9c12409f8a..3d66aec36899 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -1093,11 +1093,8 @@ static int cm_event_handler(struct iw_cm_id *cm_id, } refcount_inc(&cm_id_priv->refcount); - if (list_empty(&cm_id_priv->work_list)) { - list_add_tail(&work->list, &cm_id_priv->work_list); - queue_work(iwcm_wq, &work->work); - } else - list_add_tail(&work->list, &cm_id_priv->work_list); + list_add_tail(&work->list, &cm_id_priv->work_list); + queue_work(iwcm_wq, &work->work); out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; From a1babdb5b615751ef5ace97a37f35d0ae40fbf13 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 5 Jun 2024 08:51:00 -0600 Subject: [PATCH 14/65] RDMA/iwcm: Simplify cm_work_handler() Instead of complicating the code to avoid a spin_lock_irqsave() / spin_lock_irqrestore() pair before returning, simplify the code by removing the local variable 'empty'. Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240605145117.397751-5-bvanassche@acm.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/iwcm.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 3d66aec36899..4424f430fc08 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -1017,16 +1017,13 @@ static void cm_work_handler(struct work_struct *_work) struct iw_cm_event levent; struct iwcm_id_private *cm_id_priv = work->cm_id; unsigned long flags; - int empty; int ret = 0; spin_lock_irqsave(&cm_id_priv->lock, flags); - empty = list_empty(&cm_id_priv->work_list); - while (!empty) { + while (!list_empty(&cm_id_priv->work_list)) { work = list_first_entry(&cm_id_priv->work_list, struct iwcm_work, list); list_del_init(&work->list); - empty = list_empty(&cm_id_priv->work_list); levent = work->event; put_work(work); spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -1039,8 +1036,6 @@ static void cm_work_handler(struct work_struct *_work) pr_debug("dropping event %d\n", levent.event); if (iwcm_deref_id(cm_id_priv)) return; - if (empty) - return; spin_lock_irqsave(&cm_id_priv->lock, flags); } spin_unlock_irqrestore(&cm_id_priv->lock, flags); From aee2424246f9f1dadc33faa78990c1e2eb7826e4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 5 Jun 2024 08:51:01 -0600 Subject: [PATCH 15/65] RDMA/iwcm: Fix a use-after-free related to destroying CM IDs iw_conn_req_handler() associates a new struct rdma_id_private (conn_id) with an existing struct iw_cm_id (cm_id) as follows: conn_id->cm_id.iw = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_iw_handler; rdma_destroy_id() frees both the cm_id and the struct rdma_id_private. Make sure that cm_work_handler() does not trigger a use-after-free by only freeing the struct rdma_id_private after all pending work has finished.
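Schematically, the fixed teardown path becomes:

    void iw_destroy_cm_id(struct iw_cm_id *cm_id)
    {
        /* If a queued work item still holds a reference, wait for
         * cm_work_handler() to drop it before the caller goes on to
         * free the enclosing rdma_id_private. */
        if (!destroy_cm_id(cm_id))
            flush_workqueue(iwcm_wq);
    }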
Cc: stable@vger.kernel.org Fixes: 59c68ac31e15 ("iw_cm: free cm_id resources on the last deref") Reviewed-by: Zhu Yanjun Tested-by: Shin'ichiro Kawasaki Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240605145117.397751-6-bvanassche@acm.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/iwcm.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 4424f430fc08..1a6339f3a63f 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -368,8 +368,10 @@ EXPORT_SYMBOL(iw_cm_disconnect); * * Clean up all resources associated with the connection and release * the initial reference taken by iw_create_cm_id. + * + * Returns true if and only if the last cm_id_priv reference has been dropped. */ -static void destroy_cm_id(struct iw_cm_id *cm_id) +static bool destroy_cm_id(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; struct ib_qp *qp; @@ -439,7 +441,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM); } - (void)iwcm_deref_id(cm_id_priv); + return iwcm_deref_id(cm_id_priv); } /* @@ -450,7 +452,8 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) */ void iw_destroy_cm_id(struct iw_cm_id *cm_id) { - destroy_cm_id(cm_id); + if (!destroy_cm_id(cm_id)) + flush_workqueue(iwcm_wq); } EXPORT_SYMBOL(iw_destroy_cm_id); @@ -1031,7 +1034,7 @@ static void cm_work_handler(struct work_struct *_work) if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { ret = process_event(cm_id_priv, &levent); if (ret) - destroy_cm_id(&cm_id_priv->id); + WARN_ON_ONCE(destroy_cm_id(&cm_id_priv->id)); } else pr_debug("dropping event %d\n", levent.event); if (iwcm_deref_id(cm_id_priv)) From 2a1251e3dbb2995100b6f351c2452228895386a5 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Fri, 7 Jun 2024 03:08:17 -0700 Subject: [PATCH 16/65] RDMA/mana_ib: Process QP error events in mana_ib Process QP fatal events from the error event queue. For that, find the QP, using QPN from the event, and then call its event_handler. To find the QPs, store created RC QPs in an xarray. 
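The lookup side pairs xa_load() with a reference count so that a QP cannot be freed while an event is being delivered; a sketch of the pattern added below:

    qp = mana_get_qp_ref(mdev, qpn);  /* xa_load() + refcount_inc() under xa_lock */
    if (qp) {
        /* deliver IB_EVENT_QP_FATAL via qp->ibqp.event_handler */
        mana_put_qp_ref(qp);          /* completes qp->free on the last put */
    }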
Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1717754897-19858-1-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Wei Hu Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/device.c | 3 ++ drivers/infiniband/hw/mana/main.c | 31 +++++++++++++++++-- drivers/infiniband/hw/mana/mana_ib.h | 24 ++++++++++++++ drivers/infiniband/hw/mana/qp.c | 22 +++++++++++++ .../net/ethernet/microsoft/mana/gdma_main.c | 1 + include/net/mana/gdma.h | 1 + 6 files changed, 80 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 9a7da2ec9cdb..b07a8e2e838f 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -126,6 +126,7 @@ static int mana_ib_probe(struct auxiliary_device *adev, if (ret) goto destroy_eqs; + xa_init_flags(&dev->qp_table_wq, XA_FLAGS_LOCK_IRQ); ret = mana_ib_gd_config_mac(dev, ADDR_OP_ADD, mac_addr); if (ret) { ibdev_err(&dev->ib_dev, "Failed to add Mac address, ret %d", @@ -143,6 +144,7 @@ static int mana_ib_probe(struct auxiliary_device *adev, return 0; destroy_rnic: + xa_destroy(&dev->qp_table_wq); mana_ib_gd_destroy_rnic_adapter(dev); destroy_eqs: mana_ib_destroy_eqs(dev); @@ -158,6 +160,7 @@ static void mana_ib_remove(struct auxiliary_device *adev) struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev); ib_unregister_device(&dev->ib_dev); + xa_destroy(&dev->qp_table_wq); mana_ib_gd_destroy_rnic_adapter(dev); mana_ib_destroy_eqs(dev); mana_gd_deregister_device(dev->gdma_dev); diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 365b4f174a83..d13abc954d2a 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -667,6 +667,33 @@ int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *dev) return 0; } +static void +mana_ib_event_handler(void *ctx, struct gdma_queue *q, struct gdma_event *event) +{ + struct mana_ib_dev *mdev = (struct mana_ib_dev *)ctx; + struct mana_ib_qp *qp; + struct ib_event ev; + u32 qpn; + + switch (event->type) { + case GDMA_EQE_RNIC_QP_FATAL: + qpn = event->details[0]; + qp = mana_get_qp_ref(mdev, qpn); + if (!qp) + break; + if (qp->ibqp.event_handler) { + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_FATAL; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + mana_put_qp_ref(qp); + break; + default: + break; + } +} + int mana_ib_create_eqs(struct mana_ib_dev *mdev) { struct gdma_context *gc = mdev_to_gc(mdev); @@ -676,7 +703,7 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev) spec.type = GDMA_EQ; spec.monitor_avl_buf = false; spec.queue_size = EQ_SIZE; - spec.eq.callback = NULL; + spec.eq.callback = mana_ib_event_handler; spec.eq.context = mdev; spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE; spec.eq.msix_index = 0; @@ -691,7 +718,7 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev) err = -ENOMEM; goto destroy_fatal_eq; } - + spec.eq.callback = NULL; for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) { spec.eq.msix_index = (i + 1) % gc->num_msix_usable; err = mana_gd_create_mana_eq(mdev->gdma_dev, &spec, &mdev->eqs[i]); diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 60bc548ba4be..977da9569701 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -62,6 +62,7 @@ struct mana_ib_dev { mana_handle_t adapter_handle; struct gdma_queue *fatal_err_eq; struct gdma_queue **eqs; + struct xarray qp_table_wq; struct mana_ib_adapter_caps 
adapter_caps; }; @@ -124,6 +125,9 @@ struct mana_ib_qp { /* The port on the IB device, starting with 1 */ u32 port; + + refcount_t refcount; + struct completion free; }; struct mana_ib_ucontext { @@ -333,6 +337,26 @@ static inline struct gdma_context *mdev_to_gc(struct mana_ib_dev *mdev) return mdev->gdma_dev->gdma_context; } +static inline struct mana_ib_qp *mana_get_qp_ref(struct mana_ib_dev *mdev, + uint32_t qid) +{ + struct mana_ib_qp *qp; + unsigned long flag; + + xa_lock_irqsave(&mdev->qp_table_wq, flag); + qp = xa_load(&mdev->qp_table_wq, qid); + if (qp) + refcount_inc(&qp->refcount); + xa_unlock_irqrestore(&mdev->qp_table_wq, flag); + return qp; +} + +static inline void mana_put_qp_ref(struct mana_ib_qp *qp) +{ + if (refcount_dec_and_test(&qp->refcount)) + complete(&qp->free); +} + static inline struct net_device *mana_ib_get_netdev(struct ib_device *ibdev, u32 port) { struct mana_ib_dev *mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 34a93729db52..10cf73b569fa 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -398,6 +398,22 @@ err_free_vport: return err; } +static int mana_table_store_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + refcount_set(&qp->refcount, 1); + init_completion(&qp->free); + return xa_insert_irq(&mdev->qp_table_wq, qp->ibqp.qp_num, qp, + GFP_KERNEL); +} + +static void mana_table_remove_qp(struct mana_ib_dev *mdev, + struct mana_ib_qp *qp) +{ + xa_erase_irq(&mdev->qp_table_wq, qp->ibqp.qp_num); + mana_put_qp_ref(qp); + wait_for_completion(&qp->free); +} + static int mana_ib_create_rc_qp(struct ib_qp *ibqp, struct ib_pd *ibpd, struct ib_qp_init_attr *attr, struct ib_udata *udata) { @@ -460,6 +476,10 @@ static int mana_ib_create_rc_qp(struct ib_qp *ibqp, struct ib_pd *ibpd, } } + err = mana_table_store_qp(mdev, qp); + if (err) + goto destroy_qp; + return 0; destroy_qp: @@ -620,6 +640,8 @@ static int mana_ib_destroy_rc_qp(struct mana_ib_qp *qp, struct ib_udata *udata) container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); int i; + mana_table_remove_qp(mdev, qp); + /* Ignore return code as there is not much we can do about it. * The error message is printed inside. */ diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 1332db9a08eb..a489297f26a0 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -380,6 +380,7 @@ static void mana_gd_process_eqe(struct gdma_queue *eq) case GDMA_EQE_HWC_INIT_EQ_ID_DB: case GDMA_EQE_HWC_INIT_DATA: case GDMA_EQE_HWC_INIT_DONE: + case GDMA_EQE_RNIC_QP_FATAL: if (!eq->eq.callback) break; diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 27684135bb4d..44c797d5f899 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -60,6 +60,7 @@ enum gdma_eqe_type { GDMA_EQE_HWC_INIT_DONE = 131, GDMA_EQE_HWC_SOC_RECONFIG = 132, GDMA_EQE_HWC_SOC_RECONFIG_DATA = 133, + GDMA_EQE_RNIC_QP_FATAL = 176, }; enum { From 638420115cc4ad6c3a2683bf46a052b505abb202 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Jun 2024 13:26:38 +0300 Subject: [PATCH 17/65] IB/mlx5: Create UMR QP just before first reg_mr occurs UMR QP is not used in some cases, so move QP and its CQ creations from driver load flow to the time first reg_mr occurs, that is when MR interfaces are first called. 
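The creation itself follows a double-checked pattern so that concurrent first callers cannot race; a sketch of the mlx5r_umr_resource_init() flow added below:

    if (dev->umrc.qp)                  /* fast path: already created */
        return 0;
    mutex_lock(&dev->umrc.init_lock);
    if (!dev->umrc.qp) {               /* re-check under the lock */
        /* create the CQ and UMR QP, publish dev->umrc.qp last */
    }
    mutex_unlock(&dev->umrc.init_lock);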
The initialization of dev->umrc.pd and dev->umrc.lock is still done in driver load because pd is needed for mlx5_mkey_cache_init and the lock is reused to protect against the concurrent creation. When testing 4G bytes memory registration latency with rtool [1] and 8 threads in parallel, a minor performance degradation (<5% for the max latency) is seen for the first reg_mr with this change. Link: https://github.com/paravmellanox/rtool [1] Signed-off-by: Jianbo Liu Link: https://lore.kernel.org/r/55d3c4f8a542fd974d8a4c5816eccfb318a59b38.1717409369.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 3 +- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 + drivers/infiniband/hw/mlx5/mr.c | 9 +++++ drivers/infiniband/hw/mlx5/umr.c | 55 +++++++++++++++++++++------- drivers/infiniband/hw/mlx5/umr.h | 3 ++ 5 files changed, 58 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 00e292422801..051b93ccd8d5 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4086,6 +4086,7 @@ static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) { mlx5_mkey_cache_cleanup(dev); mlx5r_umr_resource_cleanup(dev); + mlx5r_umr_cleanup(dev); } static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev) @@ -4097,7 +4098,7 @@ static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev) { int ret; - ret = mlx5r_umr_resource_init(dev); + ret = mlx5r_umr_init(dev); if (ret) return ret; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index b68779e9d86c..1a5e16639df0 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -751,6 +751,8 @@ struct umr_common { */ struct mutex lock; unsigned int state; + /* Protects from repeat UMR QP creation */ + struct mutex init_lock; }; #define NUM_MKEYS_PER_PAGE \ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index ecc111ed5d86..bc9c3871f715 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1470,6 +1470,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct ib_umem *umem; + int err; if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) return ERR_PTR(-EOPNOTSUPP); @@ -1477,6 +1478,10 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", start, iova, length, access_flags); + err = mlx5r_umr_resource_init(dev); + if (err) + return ERR_PTR(err); + if (access_flags & IB_ACCESS_ON_DEMAND) return create_user_odp_mr(pd, start, length, iova, access_flags, udata); @@ -1523,6 +1528,10 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n", offset, virt_addr, length, fd, access_flags); + err = mlx5r_umr_resource_init(dev); + if (err) + return ERR_PTR(err); + /* dmabuf requires xlt update via umr to work.
*/ if (!mlx5r_umr_can_load_pas(dev, length)) return ERR_PTR(-EINVAL); diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index e76142f6fa88..ffc31b01f690 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -135,22 +135,28 @@ static int mlx5r_umr_qp_rst2rts(struct mlx5_ib_dev *dev, struct ib_qp *qp) int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) { struct ib_qp_init_attr init_attr = {}; - struct ib_pd *pd; struct ib_cq *cq; struct ib_qp *qp; - int ret; + int ret = 0; - pd = ib_alloc_pd(&dev->ib_dev, 0); - if (IS_ERR(pd)) { - mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); - return PTR_ERR(pd); - } + + /* + * UMR qp is set once, never changed until device unload. + * Avoid taking the mutex if initialization is already done. + */ + if (dev->umrc.qp) + return 0; + + mutex_lock(&dev->umrc.init_lock); + /* First user allocates the UMR resources. Skip if already allocated. */ + if (dev->umrc.qp) + goto unlock; cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ); if (IS_ERR(cq)) { mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); ret = PTR_ERR(cq); - goto destroy_pd; + goto unlock; } init_attr.send_cq = cq; @@ -160,7 +166,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) init_attr.cap.max_send_sge = 1; init_attr.qp_type = MLX5_IB_QPT_REG_UMR; init_attr.port_num = 1; - qp = ib_create_qp(pd, &init_attr); + qp = ib_create_qp(dev->umrc.pd, &init_attr); if (IS_ERR(qp)) { mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); ret = PTR_ERR(qp); @@ -171,22 +177,22 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) if (ret) goto destroy_qp; - dev->umrc.qp = qp; dev->umrc.cq = cq; - dev->umrc.pd = pd; sema_init(&dev->umrc.sem, MAX_UMR_WR); mutex_init(&dev->umrc.lock); dev->umrc.state = MLX5_UMR_STATE_ACTIVE; + dev->umrc.qp = qp; + mutex_unlock(&dev->umrc.init_lock); return 0; destroy_qp: ib_destroy_qp(qp); destroy_cq: ib_free_cq(cq); -destroy_pd: - ib_dealloc_pd(pd); +unlock: + mutex_unlock(&dev->umrc.init_lock); return ret; } @@ -194,8 +200,31 @@ void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev) { if (dev->umrc.state == MLX5_UMR_STATE_UNINIT) return; + mutex_destroy(&dev->umrc.lock); + /* After device init, UMR cp/qp are not unset during the lifetime. 
*/ ib_destroy_qp(dev->umrc.qp); ib_free_cq(dev->umrc.cq); +} + +int mlx5r_umr_init(struct mlx5_ib_dev *dev) +{ + struct ib_pd *pd; + + pd = ib_alloc_pd(&dev->ib_dev, 0); + if (IS_ERR(pd)) { + mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); + return PTR_ERR(pd); + } + dev->umrc.pd = pd; + + mutex_init(&dev->umrc.init_lock); + + return 0; +} + +void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev) +{ + mutex_destroy(&dev->umrc.init_lock); ib_dealloc_pd(dev->umrc.pd); } diff --git a/drivers/infiniband/hw/mlx5/umr.h b/drivers/infiniband/hw/mlx5/umr.h index 3799bb758e49..5f734dc72bef 100644 --- a/drivers/infiniband/hw/mlx5/umr.h +++ b/drivers/infiniband/hw/mlx5/umr.h @@ -16,6 +16,9 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev); void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev); +int mlx5r_umr_init(struct mlx5_ib_dev *dev); +void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev); + static inline bool mlx5r_umr_can_load_pas(struct mlx5_ib_dev *dev, size_t length) { From 5895e70f2e6e8dc67b551ca554d6fcde0a7f0467 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Jun 2024 13:26:39 +0300 Subject: [PATCH 18/65] IB/mlx5: Allocate resources just before first QP/SRQ is created MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, all IB dev resources were initialized at driver load. As they are not always used, move the initialization to the time when they are needed. To be more specific, move PD (p0) and CQ (c0) initialization to the time when the first SRQ is created, and move SRQ (s0 and s1) initialization to the time when the first QP is created. To avoid concurrent creations, two new mutexes are also added. Signed-off-by: Jianbo Liu Link: https://lore.kernel.org/r/98c3e53a8cc0bdfeb6dec6e5bb8b037d78ab00d8.1717409369.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 157 +++++++++++++++++++-------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 + drivers/infiniband/hw/mlx5/qp.c | 4 + drivers/infiniband/hw/mlx5/srq.c | 4 + 4 files changed, 122 insertions(+), 47 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 051b93ccd8d5..cee167cf310e 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2823,37 +2823,72 @@ static u8 mlx5_get_umr_fence(u8 umr_fence_cap) } } -static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) +int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_resources *devr = &dev->devr; + struct ib_cq_init_attr cq_attr = {.cqe = 1}; + struct ib_device *ibdev; + struct ib_pd *pd; + struct ib_cq *cq; + int ret = 0; + + + /* + * devr->c0 is set once, never changed until device unload. + * Avoid taking the mutex if initialization is already done.
+ */ + if (devr->c0) + return 0; + + mutex_lock(&devr->cq_lock); + if (devr->c0) + goto unlock; + + ibdev = &dev->ib_dev; + pd = ib_alloc_pd(ibdev, 0); + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); + mlx5_ib_err(dev, "Couldn't allocate PD for res init, err=%d\n", ret); + goto unlock; + } + + cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + mlx5_ib_err(dev, "Couldn't create CQ for res init, err=%d\n", ret); + ib_dealloc_pd(pd); + goto unlock; + } + + devr->p0 = pd; + devr->c0 = cq; + +unlock: + mutex_unlock(&devr->cq_lock); + return ret; +} + +int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) { struct mlx5_ib_resources *devr = &dev->devr; struct ib_srq_init_attr attr; - struct ib_device *ibdev; - struct ib_cq_init_attr cq_attr = {.cqe = 1}; - int port; + struct ib_srq *s0, *s1; int ret = 0; - ibdev = &dev->ib_dev; + /* + * devr->s1 is set once, never changed until device unload. + * Avoid taking the mutex if initialization is already done. + */ + if (devr->s1) + return 0; - if (!MLX5_CAP_GEN(dev->mdev, xrc)) - return -EOPNOTSUPP; + mutex_lock(&devr->srq_lock); + if (devr->s1) + goto unlock; - devr->p0 = ib_alloc_pd(ibdev, 0); - if (IS_ERR(devr->p0)) - return PTR_ERR(devr->p0); - - devr->c0 = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr); - if (IS_ERR(devr->c0)) { - ret = PTR_ERR(devr->c0); - goto error1; - } - - ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn0, 0); + ret = mlx5_ib_dev_res_cq_init(dev); if (ret) - goto error2; - - ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn1, 0); - if (ret) - goto error3; + goto unlock; memset(&attr, 0, sizeof(attr)); attr.attr.max_sge = 1; @@ -2861,10 +2896,11 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) attr.srq_type = IB_SRQT_XRC; attr.ext.cq = devr->c0; - devr->s0 = ib_create_srq(devr->p0, &attr); - if (IS_ERR(devr->s0)) { - ret = PTR_ERR(devr->s0); - goto err_create; + s0 = ib_create_srq(devr->p0, &attr); + if (IS_ERR(s0)) { + ret = PTR_ERR(s0); + mlx5_ib_err(dev, "Couldn't create SRQ 0 for res init, err=%d\n", ret); + goto unlock; } memset(&attr, 0, sizeof(attr)); @@ -2872,29 +2908,48 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) attr.attr.max_wr = 1; attr.srq_type = IB_SRQT_BASIC; - devr->s1 = ib_create_srq(devr->p0, &attr); - if (IS_ERR(devr->s1)) { - ret = PTR_ERR(devr->s1); - goto error6; + s1 = ib_create_srq(devr->p0, &attr); + if (IS_ERR(s1)) { + ret = PTR_ERR(s1); + mlx5_ib_err(dev, "Couldn't create SRQ 1 for res init, err=%d\n", ret); + ib_destroy_srq(s0); + } + + devr->s0 = s0; + devr->s1 = s1; + +unlock: + mutex_unlock(&devr->srq_lock); + return ret; +} + +static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_resources *devr = &dev->devr; + int port; + int ret; + + if (!MLX5_CAP_GEN(dev->mdev, xrc)) + return -EOPNOTSUPP; + + ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn0, 0); + if (ret) + return ret; + + ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn1, 0); + if (ret) { + mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0); + return ret; } for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) INIT_WORK(&devr->ports[port].pkey_change_work, pkey_change_handler); - return 0; + mutex_init(&devr->cq_lock); + mutex_init(&devr->srq_lock); -error6: - ib_destroy_srq(devr->s0); -err_create: - mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0); -error3: - mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0); -error2: - ib_destroy_cq(devr->c0); -error1: - ib_dealloc_pd(devr->p0); - return ret; + return 0; } static void 
mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev) @@ -2911,12 +2966,20 @@ static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev) for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) cancel_work_sync(&devr->ports[port].pkey_change_work); - ib_destroy_srq(devr->s1); - ib_destroy_srq(devr->s0); + /* After s0/s1 init, they are not unset during the device lifetime. */ + if (devr->s1) { + ib_destroy_srq(devr->s1); + ib_destroy_srq(devr->s0); + } mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0); mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0); - ib_destroy_cq(devr->c0); - ib_dealloc_pd(devr->p0); + /* After p0/c0 init, they are not unset during the device lifetime. */ + if (devr->c0) { + ib_destroy_cq(devr->c0); + ib_dealloc_pd(devr->p0); + } + mutex_destroy(&devr->cq_lock); + mutex_destroy(&devr->srq_lock); } static u32 get_core_cap_flags(struct ib_device *ibdev, diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 1a5e16639df0..a29102caa1b7 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -824,11 +824,13 @@ struct mlx5_ib_port_resources { struct mlx5_ib_resources { struct ib_cq *c0; + struct mutex cq_lock; u32 xrcdn0; u32 xrcdn1; struct ib_pd *p0; struct ib_srq *s0; struct ib_srq *s1; + struct mutex srq_lock; struct mlx5_ib_port_resources ports[2]; }; @@ -1272,6 +1274,8 @@ to_mmmap(struct rdma_user_mmap_entry *rdma_entry) struct mlx5_user_mmap_entry, rdma_entry); } +int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev); +int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev); int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, struct mlx5_db *db); void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index e8c0fead4062..079ad927182f 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3234,6 +3234,10 @@ int mlx5_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, enum ib_qp_type type; int err; + err = mlx5_ib_dev_res_srq_init(dev); + if (err) + return err; + err = check_qp_type(dev, attr, &type); if (err) return err; diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index a056ea835da5..fe907c908d82 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -213,6 +213,10 @@ int mlx5_ib_create_srq(struct ib_srq *ib_srq, return -EINVAL; } + err = mlx5_ib_dev_res_cq_init(dev); + if (err) + return err; + mutex_init(&srq->mutex); spin_lock_init(&srq->lock); srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1); From a4e540119be565f47c305f295ed43f8e0bc3f5c3 Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Thu, 13 Jun 2024 21:01:42 +0300 Subject: [PATCH 19/65] RDMA/mlx5: Set mkeys for dmabuf at PAGE_SIZE Set the mkey for dmabuf at PAGE_SIZE to support any SGL after a move operation. 
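As an illustration of the helper's contract (a sketch only, not part of the diff): the second argument of ib_umem_find_best_pgsz() is a bitmap of allowed page sizes, so passing the single value PAGE_SIZE leaves only that bit set and the mkey can never be promoted to a larger page size:

	/* pgsz_bitmap == PAGE_SIZE permits exactly one page size, which is
	 * the worst case an exporter may hand back after moving the buffer.
	 */
	page_size = ib_umem_find_best_pgsz(&umem_dmabuf->umem, PAGE_SIZE,
					   umem_dmabuf->umem.iova);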
ib_umem_find_best_pgsz returns 0 on error, so it is incorrect to check the returned page_size against PAGE_SIZE. Fixes: 90da7dc8206a ("RDMA/mlx5: Support dma-buf based userspace memory region") Signed-off-by: Chiara Meiohas Reviewed-by: Michael Guralnik Link: https://lore.kernel.org/r/1e2289b9133e89f273a4e68d459057d032cbc2ce.1718301631.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 13 +++++++++++++ drivers/infiniband/hw/mlx5/odp.c | 6 ++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index a29102caa1b7..74a20df8b292 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -115,6 +115,19 @@ unsigned long __mlx5_umem_find_best_quantized_pgoff( __mlx5_bit_sz(typ, page_offset_fld), 0, scale, \ page_offset_quantized) +static inline unsigned long +mlx5_umem_dmabuf_find_best_pgsz(struct ib_umem_dmabuf *umem_dmabuf) +{ + /* + * mkeys used for dmabuf are fixed at PAGE_SIZE because we must be able + * to hold any sgl after a move operation. Ideally the mkc page size + * could be changed at runtime to be optimal, but right now the driver + * cannot do that. + */ + return ib_umem_find_best_pgsz(&umem_dmabuf->umem, PAGE_SIZE, + umem_dmabuf->umem.iova); +} + enum { MLX5_IB_MMAP_OFFSET_START = 9, MLX5_IB_MMAP_OFFSET_END = 255, diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 4a04cbc5b78a..a524181f34df 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -705,10 +705,8 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, return err; } - page_size = mlx5_umem_find_best_pgsz(&umem_dmabuf->umem, mkc, - log_page_size, 0, - umem_dmabuf->umem.iova); - if (unlikely(page_size < PAGE_SIZE)) { + page_size = mlx5_umem_dmabuf_find_best_pgsz(umem_dmabuf); + if (!page_size) { ib_umem_dmabuf_unmap_pages(umem_dmabuf); err = -EINVAL; } else { From a92fbeac7e94a420b55570c10fe1b90e64da4025 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 28 May 2024 15:52:51 +0300 Subject: [PATCH 20/65] RDMA/cache: Release GID table even if leak is detected When the table is released, we nullify the pointer to the GID table; this means that if a GID entry leak is detected, we will leak the table too. Delete the code that prevents table destruction.
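Condensed from the diff below, the release path now warns per leaked entry but always completes the teardown (a sketch; surrounding code omitted):

	for (i = 0; i < table->sz; i++) {
		if (is_gid_entry_free(table->data_vec[i]))
			continue;
		WARN_ONCE(true, "GID entry ref leak for dev %s index %d ref=%u\n",
			  dev_name(&device->dev), i,
			  kref_read(&table->data_vec[i]->kref));
	}
	/* No early return on a detected leak -- the table is freed regardless */
	mutex_destroy(&table->lock);
	kfree(table->data_vec);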
Fixes: b150c3862d21 ("IB/core: Introduce GID entry reference counts") Link: https://lore.kernel.org/r/a62560af06ba82c88ef9194982bfa63d14768ff9.1716900410.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cache.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index c02a96d3572a..6791df64a5fe 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -794,7 +794,6 @@ err_free_table: static void release_gid_table(struct ib_device *device, struct ib_gid_table *table) { - bool leak = false; int i; if (!table) @@ -803,15 +802,12 @@ static void release_gid_table(struct ib_device *device, for (i = 0; i < table->sz; i++) { if (is_gid_entry_free(table->data_vec[i])) continue; - if (kref_read(&table->data_vec[i]->kref) > 1) { - dev_err(&device->dev, - "GID entry ref leak for index %d ref=%u\n", i, - kref_read(&table->data_vec[i]->kref)); - leak = true; - } + + WARN_ONCE(true, + "GID entry ref leak for dev %s index %d ref=%u\n", + dev_name(&device->dev), i, + kref_read(&table->data_vec[i]->kref)); } - if (leak) - return; mutex_destroy(&table->lock); kfree(table->data_vec); From 0d2e6992fc956e3308cd5376c18567def4cb3967 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 16 Jun 2024 19:16:33 +0300 Subject: [PATCH 21/65] RDMA/mlx4: Fix truncated output warning in mad.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Increase size of the name array to avoid truncated output warning. drivers/infiniband/hw/mlx4/mad.c: In function ‘mlx4_ib_alloc_demux_ctx’: drivers/infiniband/hw/mlx4/mad.c:2197:47: error: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size 4 [-Werror=format-truncation=] 2197 | snprintf(name, sizeof(name), "mlx4_ibt%d", port); | ^~ drivers/infiniband/hw/mlx4/mad.c:2197:38: note: directive argument in the range [-2147483645, 2147483647] 2197 | snprintf(name, sizeof(name), "mlx4_ibt%d", port); | ^~~~~~~~~~~~ drivers/infiniband/hw/mlx4/mad.c:2197:9: note: ‘snprintf’ output between 10 and 20 bytes into a destination of size 12 2197 | snprintf(name, sizeof(name), "mlx4_ibt%d", port); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/infiniband/hw/mlx4/mad.c:2205:48: error: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size 3 [-Werror=format-truncation=] 2205 | snprintf(name, sizeof(name), "mlx4_ibwi%d", port); | ^~ drivers/infiniband/hw/mlx4/mad.c:2205:38: note: directive argument in the range [-2147483645, 2147483647] 2205 | snprintf(name, sizeof(name), "mlx4_ibwi%d", port); | ^~~~~~~~~~~~~ drivers/infiniband/hw/mlx4/mad.c:2205:9: note: ‘snprintf’ output between 11 and 21 bytes into a destination of size 12 2205 | snprintf(name, sizeof(name), "mlx4_ibwi%d", port); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/infiniband/hw/mlx4/mad.c:2213:48: error: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size 3 [-Werror=format-truncation=] 2213 | snprintf(name, sizeof(name), "mlx4_ibud%d", port); | ^~ drivers/infiniband/hw/mlx4/mad.c:2213:38: note: directive argument in the range [-2147483645, 2147483647] 2213 | snprintf(name, sizeof(name), "mlx4_ibud%d", port); | ^~~~~~~~~~~~~ drivers/infiniband/hw/mlx4/mad.c:2213:9: note: ‘snprintf’ output between 11 and 21 bytes into a destination of size 12 2213 | snprintf(name, sizeof(name), "mlx4_ibud%d", port); | 
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cc1: all warnings being treated as errors make[6]: *** [scripts/Makefile.build:244: drivers/infiniband/hw/mlx4/mad.o] Error 1 Fixes: fc06573dfaf8 ("IB/mlx4: Initialize SR-IOV IB support for slaves in master context") Link: https://lore.kernel.org/r/f3798b3ce9a410257d7e1ec7c9e285f1352e256a.1718554569.git.leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/mad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index a37cfac5e23f..dc9cf45d2d32 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -2158,7 +2158,7 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, struct mlx4_ib_demux_ctx *ctx, int port) { - char name[12]; + char name[21]; int ret = 0; int i; From 5953e0647cec703ef436ead37fed48943507b433 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 16 Jun 2024 19:17:30 +0300 Subject: [PATCH 22/65] RDMA/mlx4: Fix truncated output warning in alias_GUID.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/infiniband/hw/mlx4/alias_GUID.c: In function ‘mlx4_ib_init_alias_guid_service’: drivers/infiniband/hw/mlx4/alias_GUID.c:878:74: error: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size 5 [-Werror=format-truncation=] 878 | snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i); | ^~ drivers/infiniband/hw/mlx4/alias_GUID.c:878:63: note: directive argument in the range [-2147483641, 2147483646] 878 | snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i); | ^~~~~~~~~~~~~~ drivers/infiniband/hw/mlx4/alias_GUID.c:878:17: note: ‘snprintf’ output between 12 and 22 bytes into a destination of size 15 878 | snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cc1: all warnings being treated as errors Fixes: a0c64a17aba8 ("mlx4: Add alias_guid mechanism") Link: https://lore.kernel.org/r/1951c9500109ca7e36dcd523f8a5f2d0d2a608d1.1718554641.git.leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/alias_GUID.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index 111fa88a3be4..9a439569ffcf 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -829,7 +829,7 @@ void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev) int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev) { - char alias_wq_name[15]; + char alias_wq_name[22]; int ret = 0; int i, j; union ib_gid gid; From 0c5275bf75ec3708d95654195ae4ed80d946d088 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Sun, 16 Jun 2024 19:10:36 +0300 Subject: [PATCH 23/65] RDMA/mlx5: Use sq timestamp as QP timestamp when RoCE is disabled When creating a QP, one of the attributes is TS format (timestamp). In some devices, we have a limitation that all QPs should have the same ts_format. The ts_format is chosen based on the device's capability. The qp_ts_format cap resides under the RoCE caps table, and the cap will be 0 when RoCE is disabled. So when RoCE is disabled, the value that should be queried is sq_ts_format under HCA caps. 
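Condensed from the fix below, the capability selection rule becomes (a sketch; names as in the diff):

	/* RoCE enabled:  qp_ts_format in the RoCE caps table is valid.
	 * RoCE disabled: the RoCE caps read as 0, so consult
	 *                sq_ts_format in the general HCA caps instead.
	 */
	u8 supported_ts_cap = mlx5_get_roce_state(dev) ?
				      MLX5_CAP_ROCE(dev, qp_ts_format) :
				      MLX5_CAP_GEN(dev, sq_ts_format);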
Consider the case when the system supports REAL_TIME_TS format (0x2), some QPs are created with REAL_TIME_TS as ts_format, and afterwards RoCE gets disabled. When trying to construct a new QP, we can't use the qp_ts_format that is queried from the RoCE caps table, since it leads to passing 0x0 (FREE_RUNNING_TS) as the value of the qp_ts_format, which differs from the REAL_TIME_TS format (0x2) of the previously allocated QPs. Thus, to resolve this, read the sq_ts_format, which also reflects the supported ts format for the QP when RoCE is disabled. Fixes: 4806f1e2fee8 ("net/mlx5: Set QP timestamp mode to default") Signed-off-by: Maher Sanalla Signed-off-by: Or Har-Toov Link: https://lore.kernel.org/r/32801966eb767c7fd62b8dea3b63991d5fbfe213.1718554199.git.leon@kernel.org Reviewed-by: Simon Horman Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/mlx5/qp.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index f0e55bf3ec8b..ad1ce650146c 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -576,9 +576,12 @@ static inline const char *mlx5_qp_state_str(int state) static inline int mlx5_get_qp_default_ts(struct mlx5_core_dev *dev) { - return !MLX5_CAP_ROCE(dev, qp_ts_format) ? - MLX5_TIMESTAMP_FORMAT_FREE_RUNNING : - MLX5_TIMESTAMP_FORMAT_DEFAULT; + u8 supported_ts_cap = mlx5_get_roce_state(dev) ? + MLX5_CAP_ROCE(dev, qp_ts_format) : + MLX5_CAP_GEN(dev, sq_ts_format); + + return supported_ts_cap ? MLX5_TIMESTAMP_FORMAT_DEFAULT : + MLX5_TIMESTAMP_FORMAT_FREE_RUNNING; } #endif /* MLX5_QP_H */ From 844bc12e6da3e2d8ab0cd96d049fc695d5d8ba68 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 19 Jun 2024 20:11:52 +0300 Subject: [PATCH 24/65] IB/core: add support for draining Shared receive queues To avoid leakage for QPs associated with an SRQ, according to the IB spec (section 10.3.1): "Note, for QPs that are associated with an SRQ, the Consumer should take the QP through the Error State before invoking a Destroy QP or a Modify QP to the Reset State. The Consumer may invoke the Destroy QP without first performing a Modify QP to the Error State and waiting for the Affiliated Asynchronous Last WQE Reached Event. However, if the Consumer does not wait for the Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment leakage may occur. Therefore, it is good programming practice to teardown a QP that is associated with an SRQ by using the following process: - Put the QP in the Error State; - wait for the Affiliated Asynchronous Last WQE Reached Event; - either: - drain the CQ by invoking the Poll CQ verb and either wait for CQ to be empty or the number of Poll CQ operations has exceeded CQ capacity size; or - post another WR that completes on the same CQ and wait for this WR to return as a WC; - and then invoke a Destroy QP or Reset QP." Catch the Last WQE Reached Event in the core layer during drain QP flow.
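From a ULP's perspective, teardown of a QP attached to an SRQ then reduces to the following (an illustrative sketch; error handling omitted):

	/* ib_drain_qp() now implements the spec sequence above for SRQ
	 * QPs: move the QP to the Error state, wait for the Last WQE
	 * Reached event, then poll the CQ until it is drained.
	 */
	ib_drain_qp(qp);
	ib_destroy_qp(qp);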
Signed-off-by: Max Gurtovoy Link: https://lore.kernel.org/r/20240619171153.34631-2-mgurtovoy@nvidia.com Reviewed-by: Sagi Grimberg Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 82 ++++++++++++++++++++++++++++++++- include/rdma/ib_verbs.h | 2 + 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 94a7f3b0c71c..473ee0831307 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1101,6 +1101,16 @@ EXPORT_SYMBOL(ib_destroy_srq_user); /* Queue pairs */ +static void __ib_qp_event_handler(struct ib_event *event, void *context) +{ + struct ib_qp *qp = event->element.qp; + + if (event->event == IB_EVENT_QP_LAST_WQE_REACHED) + complete(&qp->srq_completion); + if (qp->registered_event_handler) + qp->registered_event_handler(event, qp->qp_context); +} + static void __ib_shared_qp_event_handler(struct ib_event *event, void *context) { struct ib_qp *qp = context; @@ -1221,13 +1231,15 @@ static struct ib_qp *create_qp(struct ib_device *dev, struct ib_pd *pd, qp->qp_type = attr->qp_type; qp->rwq_ind_tbl = attr->rwq_ind_tbl; qp->srq = attr->srq; - qp->event_handler = attr->event_handler; + qp->event_handler = __ib_qp_event_handler; + qp->registered_event_handler = attr->event_handler; qp->port = attr->port_num; qp->qp_context = attr->qp_context; spin_lock_init(&qp->mr_lock); INIT_LIST_HEAD(&qp->rdma_mrs); INIT_LIST_HEAD(&qp->sig_mrs); + init_completion(&qp->srq_completion); qp->send_cq = attr->send_cq; qp->recv_cq = attr->recv_cq; @@ -2884,6 +2896,72 @@ static void __ib_drain_rq(struct ib_qp *qp) wait_for_completion(&rdrain.done); } +/* + * __ib_drain_srq() - Block until Last WQE Reached event arrives, or timeout + * expires. + * @qp: queue pair associated with SRQ to drain + * + * Quoting 10.3.1 Queue Pair and EE Context States: + * + * Note, for QPs that are associated with an SRQ, the Consumer should take the + * QP through the Error State before invoking a Destroy QP or a Modify QP to the + * Reset State. The Consumer may invoke the Destroy QP without first performing + * a Modify QP to the Error State and waiting for the Affiliated Asynchronous + * Last WQE Reached Event. However, if the Consumer does not wait for the + * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment + * leakage may occur. Therefore, it is good programming practice to tear down a + * QP that is associated with an SRQ by using the following process: + * + * - Put the QP in the Error State + * - Wait for the Affiliated Asynchronous Last WQE Reached Event; + * - either: + * drain the CQ by invoking the Poll CQ verb and either wait for CQ + * to be empty or the number of Poll CQ operations has exceeded + * CQ capacity size; + * - or + * post another WR that completes on the same CQ and wait for this + * WR to return as a WC; + * - and then invoke a Destroy QP or Reset QP. + * + * We use the first option. 
+ */ +static void __ib_drain_srq(struct ib_qp *qp) +{ + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct ib_cq *cq; + int n, polled = 0; + int ret; + + if (!qp->srq) { + WARN_ONCE(1, "QP 0x%p is not associated with SRQ\n", qp); + return; + } + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + WARN_ONCE(ret, "failed to drain shared recv queue: %d\n", ret); + return; + } + + if (ib_srq_has_cq(qp->srq->srq_type)) { + cq = qp->srq->ext.cq; + } else if (qp->recv_cq) { + cq = qp->recv_cq; + } else { + WARN_ONCE(1, "QP 0x%p has no CQ associated with SRQ\n", qp); + return; + } + + if (wait_for_completion_timeout(&qp->srq_completion, 60 * HZ) > 0) { + while (polled != cq->cqe) { + n = ib_process_cq_direct(cq, cq->cqe - polled); + if (!n) + return; + polled += n; + } + } +} + /** * ib_drain_sq() - Block until all SQ CQEs have been consumed by the * application. @@ -2962,6 +3040,8 @@ void ib_drain_qp(struct ib_qp *qp) ib_drain_sq(qp); if (!qp->srq) ib_drain_rq(qp); + else + __ib_drain_srq(qp); } EXPORT_SYMBOL(ib_drain_qp); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 477bf9dd5e71..5a193008f99c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1788,6 +1788,7 @@ struct ib_qp { struct list_head rdma_mrs; struct list_head sig_mrs; struct ib_srq *srq; + struct completion srq_completion; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct list_head xrcd_list; @@ -1797,6 +1798,7 @@ struct ib_qp { struct ib_qp *real_qp; struct ib_uqp_object *uobject; void (*event_handler)(struct ib_event *, void *); + void (*registered_event_handler)(struct ib_event *, void *); void *qp_context; /* sgid_attrs associated with the AV's */ const struct ib_gid_attr *av_sgid_attr; From 58945ddd7156b4d83206b7b21567e67fca16346a Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 19 Jun 2024 20:11:53 +0300 Subject: [PATCH 25/65] IB/isert: remove the handling of last WQE reached event This event is raised for QPs that are associated with a Shared RQ (SRQ). The iSER target does not support SRQ. Remove this dead code. Signed-off-by: Max Gurtovoy Link: https://lore.kernel.org/r/20240619171153.34631-3-mgurtovoy@nvidia.com Reviewed-by: Sagi Grimberg Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/isert/ib_isert.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 00a7303c8cc6..42977a5326ee 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -91,9 +91,6 @@ isert_qp_event_callback(struct ib_event *e, void *context) case IB_EVENT_COMM_EST: rdma_notify(isert_conn->cm_id, IB_EVENT_COMM_EST); break; - case IB_EVENT_QP_LAST_WQE_REACHED: - isert_warn("Reached TX IB_EVENT_QP_LAST_WQE_REACHED\n"); - break; default: break; } From 4adcaf969d77d3d3aa3871bbadc196258a38aec6 Mon Sep 17 00:00:00 2001 From: Honggang LI Date: Mon, 24 Jun 2024 10:03:48 +0800 Subject: [PATCH 26/65] RDMA/rxe: Don't set BTH_ACK_MASK for UC or UD QPs BTH_ACK_MASK bit is used to indicate that an acknowledge (for this packet) should be scheduled by the responder. Both UC and UD QPs are unacknowledged, so don't set BTH_ACK_MASK for UC or UD QPs. 
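The requester-side decision after this change is, condensed from the diff below:

	int ack_req = 0;

	/* Only acknowledged services may request an ack: at the end of a
	 * message, or once RXE_MAX_PKT_PER_ACK packets went unacked.
	 * UC/UD leave ack_req at 0, so BTH_ACK_MASK is never set for them.
	 */
	if (qp_type(qp) != IB_QPT_UD && qp_type(qp) != IB_QPT_UC)
		ack_req = (pkt->mask & RXE_END_MASK) ||
			  (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK);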
Fixes: 8700e3e7c485 ("Soft RoCE driver") Signed-off-by: Honggang LI Link: https://lore.kernel.org/r/20240624020348.494338-1-honggangli@163.com Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_req.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index cd14c4c2dff9..479c07e6e4ed 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -424,7 +424,7 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, int paylen; int solicited; u32 qp_num; - int ack_req; + int ack_req = 0; /* length from start of bth to end of icrc */ paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE; @@ -445,8 +445,9 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, qp_num = (pkt->mask & RXE_DETH_MASK) ? ibwr->wr.ud.remote_qpn : qp->attr.dest_qp_num; - ack_req = ((pkt->mask & RXE_END_MASK) || - (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK)); + if (qp_type(qp) != IB_QPT_UD && qp_type(qp) != IB_QPT_UC) + ack_req = ((pkt->mask & RXE_END_MASK) || + (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK)); if (ack_req) qp->req.noack_pkts = 0; From 5a905e33b266f504e0b290055125e5002635e2e6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 24 Jun 2024 22:13:27 +0200 Subject: [PATCH 27/65] RDMA/hfi1: Constify struct mmu_rb_ops 'struct mmu_rb_ops' is not modified in this driver. Constifying this structure moves some data to a read-only section, which increases overall security. On x86_64, with allmodconfig, as an example: Before: ====== text data bss dec hex filename 10879 164 0 11043 2b23 drivers/infiniband/hw/hfi1/pin_system.o After: ===== text data bss dec hex filename 10907 140 0 11047 2b27 drivers/infiniband/hw/hfi1/pin_system.o Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/b826dd05eefa5f4d6a7a1b4d191eaf37c714ed04.1719259997.git.christophe.jaillet@wanadoo.fr Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/mmu_rb.c | 2 +- drivers/infiniband/hw/hfi1/mmu_rb.h | 4 ++-- drivers/infiniband/hw/hfi1/pin_system.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index d4a6acad0e65..67a5c410fb5e 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -40,7 +40,7 @@ static unsigned long mmu_node_last(struct mmu_rb_node *node) } int hfi1_mmu_rb_register(void *ops_arg, - struct mmu_rb_ops *ops, + const struct mmu_rb_ops *ops, struct workqueue_struct *wq, struct mmu_rb_handler **handler) { diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h index 8e5d05454d70..3fa50dd64db6 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.h +++ b/drivers/infiniband/hw/hfi1/mmu_rb.h @@ -42,7 +42,7 @@ struct mmu_rb_handler { /* Begin on a new cachline boundary here */ struct rb_root_cached root ____cacheline_aligned_in_smp; void *ops_arg; - struct mmu_rb_ops *ops; + const struct mmu_rb_ops *ops; struct list_head lru_list; struct work_struct del_work; struct list_head del_list; @@ -51,7 +51,7 @@ struct mmu_rb_handler { }; int hfi1_mmu_rb_register(void *ops_arg, - struct mmu_rb_ops *ops, + const struct mmu_rb_ops *ops, struct workqueue_struct *wq, struct mmu_rb_handler **handler); void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler); diff --git a/drivers/infiniband/hw/hfi1/pin_system.c
b/drivers/infiniband/hw/hfi1/pin_system.c index 384f722093e0..cce56134519b 100644 --- a/drivers/infiniband/hw/hfi1/pin_system.c +++ b/drivers/infiniband/hw/hfi1/pin_system.c @@ -26,7 +26,7 @@ static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, void *arg2, bool *stop); static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode); -static struct mmu_rb_ops sdma_rb_ops = { +static const struct mmu_rb_ops sdma_rb_ops = { .filter = sdma_rb_filter, .evict = sdma_rb_evict, .remove = sdma_rb_remove, From 47f9b4190a1d9db2db670c01af7deb648073e218 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 24 Jun 2024 16:09:14 +0000 Subject: [PATCH 28/65] RDMA/efa: Use offset_in_page() function Use offset_in_page() instead of open-coding it. Link: https://lore.kernel.org/r/20240624160918.27060-2-mrgolin@amazon.com Reviewed-by: Yossi Leybovich Reviewed-by: Firas Jahjah Signed-off-by: Gal Pressman Signed-off-by: Michael Margolin Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 8f7a13b79cdc..eee2ae215414 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -524,7 +524,7 @@ static int qp_mmap_entries_setup(struct efa_qp *qp, address = dev->mem_bar_addr + resp->llq_desc_offset; length = PAGE_ALIGN(params->sq_ring_size_in_bytes + - (resp->llq_desc_offset & ~PAGE_MASK)); + offset_in_page(resp->llq_desc_offset)); qp->llq_desc_mmap_entry = efa_user_mmap_entry_insert(&ucontext->ibucontext, From fe0812e4bcfa33d640cfd0e2dc99dfeef830fa54 Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Mon, 24 Jun 2024 16:09:15 +0000 Subject: [PATCH 29/65] RDMA/efa: Remove duplicate aenq enable macro We have the same macro in the main and verbs files, and the macro in the verbs file is unused; remove it. Link: https://lore.kernel.org/r/20240624160918.27060-3-mrgolin@amazon.com Reviewed-by: Yossi Leybovich Signed-off-by: Yonatan Nachum Signed-off-by: Michael Margolin Reviewed-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_verbs.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index eee2ae215414..cd1f735d08a7 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -26,10 +26,6 @@ enum { EFA_MMAP_IO_NC, }; -#define EFA_AENQ_ENABLED_GROUPS \ - (BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \ - BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE)) - struct efa_user_mmap_entry { struct rdma_user_mmap_entry rdma_entry; u64 address; From a9f6282b27762fa9e33187d84dd562c98c7d93de Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Thu, 27 Jun 2024 21:23:04 +0530 Subject: [PATCH 30/65] MAINTAINERS: Update Maintainers for irdma driver Remove Shiraz Saleem and add Tatyana Nikolova as a co-maintainer for the irdma driver.
Link: https://lore.kernel.org/r/20240627155304.219-1-shiraz.saleem@intel.com Signed-off-by: Shiraz Saleem Acked-by: Tatyana Nikolova Signed-off-by: Jason Gunthorpe --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index aacccb376c28..c601030429bc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11066,7 +11066,7 @@ F: include/linux/net/intel/iidc.h INTEL ETHERNET PROTOCOL DRIVER FOR RDMA M: Mustafa Ismail -M: Shiraz Saleem +M: Tatyana Nikolova L: linux-rdma@vger.kernel.org S: Supported F: drivers/infiniband/hw/irdma/ From dd6d7f8574d7f8b6a0bf1aeef0b285d2706b8c2a Mon Sep 17 00:00:00 2001 From: Akiva Goldberger Date: Thu, 27 Jun 2024 21:23:49 +0300 Subject: [PATCH 31/65] RDMA: Pass entire uverbs attr bundle to create cq function Changes the create_cq verb signature by sending the entire uverbs attr bundle as a parameter. This allows drivers to send driver specific attrs through ioctl for the create_cq verb and access them in their driver specific code. Also adds a new enum value for driver specific ioctl attributes for methods already supporting UHW. Link: https://lore.kernel.org/r/ed147343987c0d43fd391c1b2f85e2f425747387.1719512393.git.leon@kernel.org Signed-off-by: Akiva Goldberger Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 2 +- drivers/infiniband/core/uverbs_std_types_cq.c | 2 +- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 3 ++- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 2 +- drivers/infiniband/hw/cxgb4/cq.c | 3 ++- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 2 +- drivers/infiniband/hw/efa/efa.h | 2 +- drivers/infiniband/hw/efa/efa_verbs.c | 3 ++- drivers/infiniband/hw/erdma/erdma_verbs.c | 3 ++- drivers/infiniband/hw/erdma/erdma_verbs.h | 2 +- drivers/infiniband/hw/hns/hns_roce_cq.c | 3 ++- drivers/infiniband/hw/hns/hns_roce_device.h | 2 +- drivers/infiniband/hw/irdma/verbs.c | 5 +++-- drivers/infiniband/hw/mana/cq.c | 3 ++- drivers/infiniband/hw/mana/mana_ib.h | 2 +- drivers/infiniband/hw/mlx4/cq.c | 3 ++- drivers/infiniband/hw/mlx4/mlx4_ib.h | 2 +- drivers/infiniband/hw/mlx5/cq.c | 3 ++- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- drivers/infiniband/hw/mthca/mthca_provider.c | 3 ++- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 3 ++- drivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 2 +- drivers/infiniband/hw/qedr/verbs.c | 3 ++- drivers/infiniband/hw/qedr/verbs.h | 2 +- drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 2 +- drivers/infiniband/hw/usnic/usnic_ib_verbs.h | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 5 +++-- drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h | 2 +- drivers/infiniband/sw/rdmavt/cq.c | 6 ++++-- drivers/infiniband/sw/rdmavt/cq.h | 2 +- drivers/infiniband/sw/rxe/rxe_verbs.c | 3 ++- drivers/infiniband/sw/siw/siw_verbs.c | 5 +++-- drivers/infiniband/sw/siw/siw_verbs.h | 2 +- include/rdma/ib_verbs.h | 2 +- include/uapi/rdma/ib_user_ioctl_cmds.h | 7 +++---- 35 files changed, 58 insertions(+), 42 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 3d3ee3eca983..1b3ea71f2c33 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1051,7 +1051,7 @@ static int create_cq(struct uverbs_attr_bundle *attrs, rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); rdma_restrack_set_name(&cq->res, NULL); - ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata); + ret = ib_dev->ops.create_cq(cq, &attr, attrs); if (ret) goto err_free; rdma_restrack_add(&cq->res); diff --git 
a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index 370ad7c83f88..432054f0a8a4 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -128,7 +128,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); rdma_restrack_set_name(&cq->res, NULL); - ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata); + ret = ib_dev->ops.create_cq(cq, &attr, attrs); if (ret) goto err_free; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index d261b09025ca..e453ca701e87 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2948,10 +2948,11 @@ int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) } int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { struct bnxt_re_cq *cq = container_of(ibcq, struct bnxt_re_cq, ib_cq); struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibcq->device, ibdev); + struct ib_udata *udata = &attrs->driver_udata; struct bnxt_re_ucontext *uctx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, ib_uctx); struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index b267d6d5975f..e98cb1717338 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -221,7 +221,7 @@ int bnxt_re_post_send(struct ib_qp *qp, const struct ib_send_wr *send_wr, int bnxt_re_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); int bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc); diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 7e2835dcbc1c..5111421f9473 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -995,8 +995,9 @@ int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) } int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; int vector = attr->comp_vector; diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index fb8a0c248866..f838bb6718af 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -978,7 +978,7 @@ int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata); int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata); void c4iw_cq_rem_ref(struct c4iw_cq *chp); int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); int c4iw_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *attr, enum ib_srq_attr_mask srq_attr_mask, diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h index 926f9ff1f60f..e580e087e9da 
100644 --- a/drivers/infiniband/hw/efa/efa.h +++ b/drivers/infiniband/hw/efa/efa.h @@ -161,7 +161,7 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index cd1f735d08a7..b1e0a1b7c59d 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1080,8 +1080,9 @@ static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, } int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct efa_ucontext *ucontext = rdma_udata_to_drv_context( udata, struct efa_ucontext, ibucontext); struct efa_com_create_cq_params params = {}; diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 40c9b6e46b82..d7e1cbf9f5c2 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -1628,8 +1628,9 @@ err_out: } int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct erdma_cq *cq = to_ecq(ibcq); struct erdma_dev *dev = to_edev(ibcq->device); unsigned int depth = attr->cqe; diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index 4f02ba06b210..6afdc02f5869 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -329,7 +329,7 @@ int erdma_query_device(struct ib_device *dev, struct ib_device_attr *attr, int erdma_get_port_immutable(struct ib_device *dev, u32 port, struct ib_port_immutable *ib_port_immutable); int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *data); + struct uverbs_attr_bundle *attrs); int erdma_query_port(struct ib_device *dev, u32 port, struct ib_port_attr *attr); int erdma_query_gid(struct ib_device *dev, u32 port, int idx, diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 56dc3908da2f..4ec66611a143 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -353,9 +353,10 @@ static int set_cqe_size(struct hns_roce_cq *hr_cq, struct ib_udata *udata, } int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device); + struct ib_udata *udata = &attrs->driver_udata; struct hns_roce_ib_create_cq_resp resp = {}; struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq); struct ib_device *ibdev = &hr_dev->ib_dev; diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index ff0b3f68ee3a..ef50cd03f489 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1267,7 +1267,7 @@ __be32 send_ieth(const struct ib_send_wr *wr); int to_hr_qp_type(int qp_type); int hns_roce_create_cq(struct 
ib_cq *ib_cq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int hns_roce_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata); int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt, diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 12704efb7b19..fc0ce35da14e 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -2035,14 +2035,15 @@ static inline int cq_validate_flags(u32 flags, u8 hw_rev) * irdma_create_cq - create cq * @ibcq: CQ allocated * @attr: attributes for cq - * @udata: user data + * @attrs: uverbs attribute bundle */ static int irdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { #define IRDMA_CREATE_CQ_MIN_REQ_LEN offsetofend(struct irdma_create_cq_req, user_cq_buf) #define IRDMA_CREATE_CQ_MIN_RESP_LEN offsetofend(struct irdma_create_cq_resp, cq_size) + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; struct irdma_device *iwdev = to_iwdev(ibdev); struct irdma_pci_f *rf = iwdev->rf; diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index c6a3fd57a196..f04a679d2871 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -6,8 +6,9 @@ #include "mana_ib.h" int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); struct mana_ib_create_cq_resp resp = {}; struct mana_ib_ucontext *mana_ucontext; diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 977da9569701..b53a5b4de908 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -429,7 +429,7 @@ void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd, u32 port); int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 4cd738aae53c..aa9ea6ba26e5 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -172,8 +172,9 @@ err_buf: #define CQ_CREATE_FLAGS_SUPPORTED IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; int vector = attr->comp_vector; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 41ca1114a995..b52bceff7d97 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -767,7 +767,7 @@ int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); 
int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 9773d2a3d97f..eebfd6fa4d9b 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -942,8 +942,9 @@ static void notify_soft_wc_handler(struct work_struct *work) } int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; int vector = attr->comp_vector; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 74a20df8b292..20c9dbbf17bb 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1328,7 +1328,7 @@ int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer, size_t buflen, size_t *bc); int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index e1325f2927d6..6a1e2e79ddc3 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -574,8 +574,9 @@ static int mthca_destroy_qp(struct ib_qp *qp, struct ib_udata *udata) static int mthca_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; struct mthca_create_cq ucmd; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index c849fdbd4c99..979de8f8df14 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -963,8 +963,9 @@ err: } int ocrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index f860b7fcef33..0644346d8d98 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -70,7 +70,7 @@ int ocrdma_alloc_pd(struct ib_pd *pd, struct ib_udata *udata); int ocrdma_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata); int ocrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int ocrdma_resize_cq(struct ib_cq *, int cqe, struct ib_udata *); int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index f118ce0a9a61..568a5b18803f 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ 
-900,8 +900,9 @@ int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) } int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; struct qedr_ucontext *ctx = rdma_udata_to_drv_context( udata, struct qedr_ucontext, ibucontext); diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h index 081753df79ef..5731458abb06 100644 --- a/drivers/infiniband/hw/qedr/verbs.h +++ b/drivers/infiniband/hw/qedr/verbs.h @@ -52,7 +52,7 @@ int qedr_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata); int qedr_alloc_xrcd(struct ib_xrcd *ibxrcd, struct ib_udata *udata); int qedr_dealloc_xrcd(struct ib_xrcd *ibxrcd, struct ib_udata *udata); int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); int qedr_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *attrs, diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 6289238cc5af..217af34e82b3 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -577,7 +577,7 @@ out_unlock: } int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { if (attr->flags) return -EOPNOTSUPP; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h index 6ca9ee0dddbe..53f53f2d53be 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -56,7 +56,7 @@ int usnic_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata); int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index 6aa40bd2fd52..b3df6eb9b8ef 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -94,13 +94,14 @@ int pvrdma_req_notify_cq(struct ib_cq *ibcq, * pvrdma_create_cq - create completion queue * @ibcq: Allocated CQ * @attr: completion queue attributes - * @udata: user data + * @attrs: bundle * * @return: 0 on success */ int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; struct pvrdma_dev *dev = to_vdev(ibdev); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h index 78807b23d831..4b9edc03d73d 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h @@ -375,7 +375,7 @@ struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, int 
pvrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int pvrdma_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 82c3f5932249..0ca2743f1075 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -5,6 +5,7 @@ #include #include +#include #include "cq.h" #include "vt.h" #include "trace.h" @@ -149,15 +150,16 @@ static void send_complete(struct work_struct *work) * rvt_create_cq - create a completion queue * @ibcq: Allocated CQ * @attr: creation attributes - * @udata: user data for libibverbs.so + * @attrs: uverbs bundle * * Called by ib_create_cq() in the generic verbs code. * * Return: 0 on success */ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; struct rvt_dev_info *rdi = ib_to_rvt(ibdev); struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); diff --git a/drivers/infiniband/sw/rdmavt/cq.h b/drivers/infiniband/sw/rdmavt/cq.h index d49b6d1a26cb..4028702a7b2f 100644 --- a/drivers/infiniband/sw/rdmavt/cq.h +++ b/drivers/infiniband/sw/rdmavt/cq.h @@ -10,7 +10,7 @@ #include int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index c7d4d8ab5a09..82bb7f33290c 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1053,8 +1053,9 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, /* cq */ static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *dev = ibcq->device; struct rxe_dev *rxe = to_rdev(dev); struct rxe_cq *cq = to_rcq(ibcq); diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index ecf0444666b4..986666c19378 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -1124,12 +1124,13 @@ int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata) * * @base_cq: CQ as allocated by RDMA midlayer * @attr: Initial CQ attributes - * @udata: relates to user context + * @attrs: uverbs bundle */ int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct siw_device *sdev = to_siw_dev(base_cq->device); struct siw_cq *cq = to_siw_cq(base_cq); int rv, size = attr->cqe; diff --git a/drivers/infiniband/sw/siw/siw_verbs.h b/drivers/infiniband/sw/siw/siw_verbs.h index 4b57a4fb7237..1f1a305540af 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.h +++ 
b/drivers/infiniband/sw/siw/siw_verbs.h @@ -43,7 +43,7 @@ int siw_get_port_immutable(struct ib_device *base_dev, u32 port, int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, struct ib_udata *udata); int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int siw_query_port(struct ib_device *base_dev, u32 port, struct ib_port_attr *attr); int siw_query_gid(struct ib_device *base_dev, u32 port, int idx, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 5a193008f99c..825043512b2d 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2465,7 +2465,7 @@ struct ib_device_ops { int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int (*destroy_qp)(struct ib_qp *qp, struct ib_udata *udata); int (*create_cq)(struct ib_cq *cq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index dafc7ebe545b..ec719053aab9 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -37,9 +37,6 @@ #define UVERBS_ID_NS_MASK 0xF000 #define UVERBS_ID_NS_SHIFT 12 -#define UVERBS_UDATA_DRIVER_DATA_NS 1 -#define UVERBS_UDATA_DRIVER_DATA_FLAG (1UL << UVERBS_ID_NS_SHIFT) - enum uverbs_default_objects { UVERBS_OBJECT_DEVICE, /* No instances of DEVICE are allowed */ UVERBS_OBJECT_PD, @@ -61,8 +58,10 @@ enum uverbs_default_objects { }; enum { - UVERBS_ATTR_UHW_IN = UVERBS_UDATA_DRIVER_DATA_FLAG, + UVERBS_ID_DRIVER_NS = 1UL << UVERBS_ID_NS_SHIFT, + UVERBS_ATTR_UHW_IN = UVERBS_ID_DRIVER_NS, UVERBS_ATTR_UHW_OUT, + UVERBS_ID_DRIVER_NS_WITH_UHW, }; enum uverbs_methods_device { From 589b844f1bf04850d9fabcaa2e943325dc6768b4 Mon Sep 17 00:00:00 2001 From: Akiva Goldberger Date: Thu, 27 Jun 2024 21:23:50 +0300 Subject: [PATCH 32/65] RDMA/mlx5: Send UAR page index as ioctl attribute Add UAR page index as a driver ioctl attribute to increase the number of supported indices, previously limited to 16 bits by mlx5_ib_create_cq struct. 
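As a hedged illustration of the resulting precedence (a sketch only -- pick_cq_uar_index() is a made-up helper, not the mlx5 code; the real logic lives in create_cq_user() below), the optional u32 ioctl attribute wins over the legacy 16-bit struct field:

static int pick_cq_uar_index(struct uverbs_attr_bundle *attrs,
			     struct mlx5_ib_create_cq *ucmd, int *index)
{
	/* Prefer the new u32 ioctl attribute when userspace sent it... */
	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_CQ_UAR_INDEX))
		return uverbs_copy_from(index, attrs,
					MLX5_IB_ATTR_CREATE_CQ_UAR_INDEX);
	/* ...otherwise fall back to the legacy field, capped at 16 bits. */
	if (ucmd->flags & MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX)
		*index = ucmd->uar_page_index;
	return 0;
}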
Link: https://lore.kernel.org/r/0e18b34d7ec3b1ae02d694b0d545aed7413c0ef7.1719512393.git.leon@kernel.org Signed-off-by: Akiva Goldberger Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cq.c | 28 +++++++++++++++++++++--- drivers/infiniband/hw/mlx5/main.c | 1 + drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + include/uapi/rdma/mlx5_user_ioctl_cmds.h | 4 ++++ 4 files changed, 31 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index eebfd6fa4d9b..4c54dc578069 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -38,6 +38,9 @@ #include "srq.h" #include "qp.h" +#define UVERBS_MODULE_NAME mlx5_ib +#include + static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe) { struct ib_cq *ibcq = &to_mibcq(cq)->ibcq; @@ -714,7 +717,8 @@ static int mini_cqe_res_format_to_hw(struct mlx5_ib_dev *dev, u8 format) static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, struct mlx5_ib_cq *cq, int entries, u32 **cqb, - int *cqe_size, int *index, int *inlen) + int *cqe_size, int *index, int *inlen, + struct uverbs_attr_bundle *attrs) { struct mlx5_ib_create_cq ucmd = {}; unsigned long page_size; @@ -788,7 +792,11 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET(cqc, cqc, page_offset, page_offset_quantized); - if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX) { + if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_CQ_UAR_INDEX)) { + err = uverbs_copy_from(index, attrs, MLX5_IB_ATTR_CREATE_CQ_UAR_INDEX); + if (err) + goto err_cqb; + } else if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX) { *index = ucmd.uar_page_index; } else if (context->bfregi.lib_uar_dyn) { err = -EINVAL; @@ -981,7 +989,7 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, if (udata) { err = create_cq_user(dev, udata, cq, entries, &cqb, &cqe_size, - &index, &inlen); + &index, &inlen, attrs); if (err) return err; } else { @@ -1443,3 +1451,17 @@ int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc) return 0; } + +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_cq_create, + UVERBS_OBJECT_CQ, + UVERBS_METHOD_CQ_CREATE, + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_CREATE_CQ_UAR_INDEX, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL)); + +const struct uapi_definition mlx5_ib_create_cq_defs[] = { + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_CQ, &mlx5_ib_cq_create), + {}, +}; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index cee167cf310e..4adcbbafb777 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3750,6 +3750,7 @@ static const struct uapi_definition mlx5_ib_defs[] = { UAPI_DEF_CHAIN(mlx5_ib_qos_defs), UAPI_DEF_CHAIN(mlx5_ib_std_types_defs), UAPI_DEF_CHAIN(mlx5_ib_dm_defs), + UAPI_DEF_CHAIN(mlx5_ib_create_cq_defs), UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR, diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 20c9dbbf17bb..8f4618dee869 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1528,6 +1528,7 @@ extern const struct uapi_definition mlx5_ib_devx_defs[]; extern const struct uapi_definition mlx5_ib_flow_defs[]; extern const struct uapi_definition mlx5_ib_qos_defs[]; extern const struct uapi_definition mlx5_ib_std_types_defs[]; 
+extern const struct uapi_definition mlx5_ib_create_cq_defs[]; static inline int is_qp1(enum ib_qp_type qp_type) { diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 595edad03dfe..5b74d6534899 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -270,6 +270,10 @@ enum mlx5_ib_device_query_context_attrs { MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX = (1U << UVERBS_ID_NS_SHIFT), }; +enum mlx5_ib_create_cq_attrs { + MLX5_IB_ATTR_CREATE_CQ_UAR_INDEX = UVERBS_ID_DRIVER_NS_WITH_UHW, +}; + #define MLX5_IB_DW_MATCH_PARAM 0xA0 struct mlx5_ib_match_params { From 917918f57a7b139c043e78c502876f2c286f4f0a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 24 Jun 2024 16:24:32 +0300 Subject: [PATCH 33/65] RDMA/device: Return error earlier if port is not valid There is no need to allocate port data if the provided port is not valid. Fixes: c2261dd76b54 ("RDMA/device: Add ib_device_set_netdev() as an alternative to get_netdev") Link: https://lore.kernel.org/r/022047a8b16988fc88d4426da50bf60a4833311b.1719235449.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 55aa7aa32d4a..e0cff28bb0ef 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2146,6 +2146,9 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, unsigned long flags; int ret; + if (!rdma_is_port_valid(ib_dev, port)) + return -EINVAL; + /* * Drivers wish to call this before ib_register_driver, so we have to * setup the port data early. */ @@ -2154,9 +2157,6 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, if (ret) return ret; - if (!rdma_is_port_valid(ib_dev, port)) - return -EINVAL; - pdata = &ib_dev->port_data[port]; spin_lock_irqsave(&pdata->netdev_lock, flags); old_ndev = rcu_dereference_protected( From 8e6e5ac7c468cf330c55158fcaa4ab5ad84949c9 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Wed, 26 Jun 2024 19:41:03 -0700 Subject: [PATCH 34/65] RDMA/bnxt_re: Update the correct DB FIFO depth and mask for GenP7 GenP5 and P7 devices have different DB FIFO depths. Use different values based on the chip context. Instead of hardcoding the doorbell FIFO related values, get them from the HWRM interface. Maintain backward compatibility by using default values when the FW does not provide the doorbell FIFO related values.
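As a worked example of the register math, using the GenP5 constants this patch stops hardcoding (room mask 0x1FFF8000, shift 15, depth 0x2c00); the register value itself is made up for illustration:

	u32 read_val = 0x0c000000;			/* illustrative register read */
	u32 room = (read_val & 0x1FFF8000) >> 15;	/* 0x1800 free entries */
	u32 occup = 0x2c00 - room;			/* 0x1400 entries in use */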
Signed-off-by: Chandramohan Akula Signed-off-by: Ajit Khaparde Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1719456065-27394-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 12 ++++++-- drivers/infiniband/hw/bnxt_re/main.c | 38 ++++++++++++++++--------- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 9dca451ed522..0880b4b745ae 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -131,9 +131,15 @@ struct bnxt_re_pacing { #define BNXT_RE_PACING_ALARM_TH_MULTIPLE 2 /* Multiple of pacing algo threshold */ /* Default do_pacing value when there is no congestion */ #define BNXT_RE_DBR_DO_PACING_NO_CONGESTION 0x7F /* 1 in 512 probability */ -#define BNXT_RE_DB_FIFO_ROOM_MASK 0x1FFF8000 -#define BNXT_RE_MAX_FIFO_DEPTH 0x2c00 -#define BNXT_RE_DB_FIFO_ROOM_SHIFT 15 + +#define BNXT_RE_MAX_FIFO_DEPTH_P5 0x2c00 +#define BNXT_RE_MAX_FIFO_DEPTH_P7 0x8000 + +#define BNXT_RE_MAX_FIFO_DEPTH(ctx) \ + (bnxt_qplib_is_chip_gen_p7((ctx)) ? \ + BNXT_RE_MAX_FIFO_DEPTH_P7 :\ + BNXT_RE_MAX_FIFO_DEPTH_P5) + #define BNXT_RE_GRC_FIFO_REG_BASE 0x2000 #define MAX_CQ_HASH_BITS (16) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 54b4d2f3a5d8..2a727f4e5584 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -444,6 +444,7 @@ int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev) static int bnxt_re_hwrm_dbr_pacing_qcfg(struct bnxt_re_dev *rdev) { + struct bnxt_qplib_db_pacing_data *pacing_data = rdev->qplib_res.pacing_data; struct hwrm_func_dbr_pacing_qcfg_output resp = {}; struct hwrm_func_dbr_pacing_qcfg_input req = {}; struct bnxt_en_dev *en_dev = rdev->en_dev; @@ -465,6 +466,13 @@ static int bnxt_re_hwrm_dbr_pacing_qcfg(struct bnxt_re_dev *rdev) cctx->dbr_stat_db_fifo = le32_to_cpu(resp.dbr_stat_db_fifo_reg) & ~FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_MASK; + + pacing_data->fifo_max_depth = le32_to_cpu(resp.dbr_stat_db_max_fifo_depth); + if (!pacing_data->fifo_max_depth) + pacing_data->fifo_max_depth = BNXT_RE_MAX_FIFO_DEPTH(cctx); + pacing_data->fifo_room_mask = le32_to_cpu(resp.dbr_stat_db_fifo_reg_fifo_room_mask); + pacing_data->fifo_room_shift = resp.dbr_stat_db_fifo_reg_fifo_room_shift; + return 0; } @@ -481,6 +489,7 @@ static void bnxt_re_set_default_pacing_data(struct bnxt_re_dev *rdev) static void __wait_for_fifo_occupancy_below_th(struct bnxt_re_dev *rdev) { + struct bnxt_qplib_db_pacing_data *pacing_data = rdev->qplib_res.pacing_data; u32 read_val, fifo_occup; /* loop shouldn't run infintely as the occupancy usually goes @@ -488,14 +497,14 @@ static void __wait_for_fifo_occupancy_below_th(struct bnxt_re_dev *rdev) */ while (1) { read_val = readl(rdev->en_dev->bar0 + rdev->pacing.dbr_db_fifo_reg_off); - fifo_occup = BNXT_RE_MAX_FIFO_DEPTH - - ((read_val & BNXT_RE_DB_FIFO_ROOM_MASK) >> - BNXT_RE_DB_FIFO_ROOM_SHIFT); + fifo_occup = pacing_data->fifo_max_depth - + ((read_val & pacing_data->fifo_room_mask) >> + pacing_data->fifo_room_shift); /* Fifo occupancy cannot be greater the MAX FIFO depth */ - if (fifo_occup > BNXT_RE_MAX_FIFO_DEPTH) + if (fifo_occup > pacing_data->fifo_max_depth) break; - if (fifo_occup < rdev->qplib_res.pacing_data->pacing_th) + if (fifo_occup < pacing_data->pacing_th) break; } } @@ -553,9 +562,9 @@ static void bnxt_re_pacing_timer_exp(struct work_struct 
*work) pacing_data = rdev->qplib_res.pacing_data; read_val = readl(rdev->en_dev->bar0 + rdev->pacing.dbr_db_fifo_reg_off); - fifo_occup = BNXT_RE_MAX_FIFO_DEPTH - - ((read_val & BNXT_RE_DB_FIFO_ROOM_MASK) >> - BNXT_RE_DB_FIFO_ROOM_SHIFT); + fifo_occup = pacing_data->fifo_max_depth - + ((read_val & pacing_data->fifo_room_mask) >> + pacing_data->fifo_room_shift); if (fifo_occup > pacing_data->pacing_th) goto restart_timer; @@ -594,7 +603,7 @@ void bnxt_re_pacing_alert(struct bnxt_re_dev *rdev) * Increase the alarm_th to max so that other user lib instances do not * keep alerting the driver. */ - pacing_data->alarm_th = BNXT_RE_MAX_FIFO_DEPTH; + pacing_data->alarm_th = pacing_data->fifo_max_depth; pacing_data->do_pacing = BNXT_RE_MAX_DBR_DO_PACING; cancel_work_sync(&rdev->dbq_fifo_check_work); schedule_work(&rdev->dbq_fifo_check_work); @@ -603,8 +612,6 @@ void bnxt_re_pacing_alert(struct bnxt_re_dev *rdev) static int bnxt_re_initialize_dbr_pacing(struct bnxt_re_dev *rdev) { - if (bnxt_re_hwrm_dbr_pacing_qcfg(rdev)) - return -EIO; /* Allocate a page for app use */ rdev->pacing.dbr_page = (void *)__get_free_page(GFP_KERNEL); @@ -614,6 +621,12 @@ static int bnxt_re_initialize_dbr_pacing(struct bnxt_re_dev *rdev) memset((u8 *)rdev->pacing.dbr_page, 0, PAGE_SIZE); rdev->qplib_res.pacing_data = (struct bnxt_qplib_db_pacing_data *)rdev->pacing.dbr_page; + if (bnxt_re_hwrm_dbr_pacing_qcfg(rdev)) { + free_page((u64)rdev->pacing.dbr_page); + rdev->pacing.dbr_page = NULL; + return -EIO; + } + /* MAP HW window 2 for reading db fifo depth */ writel(rdev->chip_ctx->dbr_stat_db_fifo & BNXT_GRC_BASE_MASK, rdev->en_dev->bar0 + BNXT_GRCPF_REG_WINDOW_BASE_OUT + 4); @@ -627,9 +640,6 @@ static int bnxt_re_initialize_dbr_pacing(struct bnxt_re_dev *rdev) rdev->pacing.dbq_pacing_time = BNXT_RE_DBR_PACING_TIME; rdev->pacing.dbr_def_do_pacing = BNXT_RE_DBR_DO_PACING_NO_CONGESTION; rdev->pacing.do_pacing_save = rdev->pacing.dbr_def_do_pacing; - rdev->qplib_res.pacing_data->fifo_max_depth = BNXT_RE_MAX_FIFO_DEPTH; - rdev->qplib_res.pacing_data->fifo_room_mask = BNXT_RE_DB_FIFO_ROOM_MASK; - rdev->qplib_res.pacing_data->fifo_room_shift = BNXT_RE_DB_FIFO_ROOM_SHIFT; rdev->qplib_res.pacing_data->grc_reg_offset = rdev->pacing.dbr_db_fifo_reg_off; bnxt_re_set_default_pacing_data(rdev); /* Initialize worker for DBR Pacing */ From f2f4dc9124019dfc8c4f41a344ee43bffaa36d3f Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Wed, 26 Jun 2024 19:41:04 -0700 Subject: [PATCH 35/65] RDMA/bnxt_re: Enable DB moderation for genP7 adapters Enable DB moderation support for GenP7 adapters as well. Query the capability from the FW and update the status.
Signed-off-by: Chandramohan Akula Signed-off-by: Ajit Khaparde Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1719456065-27394-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 2a727f4e5584..2c5282fdc9f7 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -423,6 +423,7 @@ int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev) struct hwrm_func_qcaps_input req = {}; struct bnxt_qplib_chip_ctx *cctx; struct bnxt_fw_msg fw_msg = {}; + u32 flags_ext2; int rc; cctx = rdev->chip_ctx; @@ -436,9 +437,9 @@ int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev) return rc; cctx->modes.db_push = le32_to_cpu(resp.flags) & FUNC_QCAPS_RESP_FLAGS_WCB_PUSH_MODE; - cctx->modes.dbr_pacing = - le32_to_cpu(resp.flags_ext2) & - FUNC_QCAPS_RESP_FLAGS_EXT2_DBR_PACING_EXT_SUPPORTED; + flags_ext2 = le32_to_cpu(resp.flags_ext2); + cctx->modes.dbr_pacing = flags_ext2 & FUNC_QCAPS_RESP_FLAGS_EXT2_DBR_PACING_EXT_SUPPORTED || + flags_ext2 & FUNC_QCAPS_RESP_FLAGS_EXT2_DBR_PACING_V0_SUPPORTED; return 0; } From 24943dcdc156cf294d97a36bf5c51168bf574c22 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Wed, 26 Jun 2024 19:41:05 -0700 Subject: [PATCH 36/65] RDMA/bnxt_re: Disable doorbell moderation if hardware register read fails If the HW register read fails, the FIFO will always be shown as full. DB moderation doesn't work in that case and the traffic fails. So disable this feature and log a message. Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1719456065-27394-4-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 45 +++++++++++++++++++++------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 2c5282fdc9f7..9714b9ab7524 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -488,19 +488,40 @@ static void bnxt_re_set_default_pacing_data(struct bnxt_re_dev *rdev) pacing_data->pacing_th * BNXT_RE_PACING_ALARM_TH_MULTIPLE; } -static void __wait_for_fifo_occupancy_below_th(struct bnxt_re_dev *rdev) +static u32 __get_fifo_occupancy(struct bnxt_re_dev *rdev) { struct bnxt_qplib_db_pacing_data *pacing_data = rdev->qplib_res.pacing_data; u32 read_val, fifo_occup; + read_val = readl(rdev->en_dev->bar0 + rdev->pacing.dbr_db_fifo_reg_off); + fifo_occup = pacing_data->fifo_max_depth - + ((read_val & pacing_data->fifo_room_mask) >> + pacing_data->fifo_room_shift); + return fifo_occup; +} + +static bool is_dbr_fifo_full(struct bnxt_re_dev *rdev) +{ + u32 max_occup, fifo_occup; + + fifo_occup = __get_fifo_occupancy(rdev); + max_occup = BNXT_RE_MAX_FIFO_DEPTH(rdev->chip_ctx) - 1; + if (fifo_occup == max_occup) + return true; + + return false; +} + +static void __wait_for_fifo_occupancy_below_th(struct bnxt_re_dev *rdev) +{ + struct bnxt_qplib_db_pacing_data *pacing_data = rdev->qplib_res.pacing_data; + u32 fifo_occup; + /* loop shouldn't run infintely as the occupancy usually goes * below pacing algo threshold as soon as pacing kicks in.
*/ while (1) { - read_val = readl(rdev->en_dev->bar0 + rdev->pacing.dbr_db_fifo_reg_off); - fifo_occup = pacing_data->fifo_max_depth - - ((read_val & pacing_data->fifo_room_mask) >> - pacing_data->fifo_room_shift); + fifo_occup = __get_fifo_occupancy(rdev); /* Fifo occupancy cannot be greater the MAX FIFO depth */ if (fifo_occup > pacing_data->fifo_max_depth) break; @@ -556,16 +577,13 @@ static void bnxt_re_pacing_timer_exp(struct work_struct *work) struct bnxt_re_dev *rdev = container_of(work, struct bnxt_re_dev, dbq_pacing_work.work); struct bnxt_qplib_db_pacing_data *pacing_data; - u32 read_val, fifo_occup; + u32 fifo_occup; if (!mutex_trylock(&rdev->pacing.dbq_lock)) return; pacing_data = rdev->qplib_res.pacing_data; - read_val = readl(rdev->en_dev->bar0 + rdev->pacing.dbr_db_fifo_reg_off); - fifo_occup = pacing_data->fifo_max_depth - - ((read_val & pacing_data->fifo_room_mask) >> - pacing_data->fifo_room_shift); + fifo_occup = __get_fifo_occupancy(rdev); if (fifo_occup > pacing_data->pacing_th) goto restart_timer; @@ -613,7 +631,6 @@ void bnxt_re_pacing_alert(struct bnxt_re_dev *rdev) static int bnxt_re_initialize_dbr_pacing(struct bnxt_re_dev *rdev) { - /* Allocate a page for app use */ rdev->pacing.dbr_page = (void *)__get_free_page(GFP_KERNEL); if (!rdev->pacing.dbr_page) @@ -637,6 +654,12 @@ static int bnxt_re_initialize_dbr_pacing(struct bnxt_re_dev *rdev) rdev->pacing.dbr_bar_addr = pci_resource_start(rdev->qplib_res.pdev, 0) + rdev->pacing.dbr_db_fifo_reg_off; + if (is_dbr_fifo_full(rdev)) { + free_page((u64)rdev->pacing.dbr_page); + rdev->pacing.dbr_page = NULL; + return -EIO; + } + rdev->pacing.pacing_algo_th = BNXT_RE_PACING_ALGO_THRESHOLD; rdev->pacing.dbq_pacing_time = BNXT_RE_DBR_PACING_TIME; rdev->pacing.dbr_def_do_pacing = BNXT_RE_DBR_DO_PACING_NO_CONGESTION; From 50660c5197f52b8137e223dc3ba8d43661179a1d Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 16 Jun 2024 19:08:33 +0300 Subject: [PATCH 37/65] RDMA/core: Create "issm*" device nodes only when SMI is supported For an IB port, create its issm device node only when it has the SMI capability. In the following patches mlx5 is going to support IB devices without this cap.
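For illustration (the device numbering is hypothetical), after this change a port without the SMI capability registers only its umad node, so userspace would see something like:

$ ls /dev/infiniband
issm0  umad0  umad1

where port 1 supports SMI (issm0 alongside umad0) and port 2 does not (umad1 only).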
Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/359f73c9a388d5e3ae971e40d8507888b1ba6f93.1718553901.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/user_mad.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 2ed749f50a29..f760dfffa188 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -1321,15 +1321,17 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, if (ret) goto err_cdev; - ib_umad_init_port_dev(&port->sm_dev, port, device); - port->sm_dev.devt = base_issm; - dev_set_name(&port->sm_dev, "issm%d", port->dev_num); - cdev_init(&port->sm_cdev, &umad_sm_fops); - port->sm_cdev.owner = THIS_MODULE; + if (rdma_cap_ib_smi(device, port_num)) { + ib_umad_init_port_dev(&port->sm_dev, port, device); + port->sm_dev.devt = base_issm; + dev_set_name(&port->sm_dev, "issm%d", port->dev_num); + cdev_init(&port->sm_cdev, &umad_sm_fops); + port->sm_cdev.owner = THIS_MODULE; - ret = cdev_device_add(&port->sm_cdev, &port->sm_dev); - if (ret) - goto err_dev; + ret = cdev_device_add(&port->sm_cdev, &port->sm_dev); + if (ret) + goto err_dev; + } return 0; @@ -1345,9 +1347,13 @@ err_cdev: static void ib_umad_kill_port(struct ib_umad_port *port) { struct ib_umad_file *file; + bool has_smi = false; int id; - cdev_device_del(&port->sm_cdev, &port->sm_dev); + if (rdma_cap_ib_smi(port->ib_dev, port->port_num)) { + cdev_device_del(&port->sm_cdev, &port->sm_dev); + has_smi = true; + } cdev_device_del(&port->cdev, &port->dev); mutex_lock(&port->file_mutex); @@ -1373,7 +1379,8 @@ static void ib_umad_kill_port(struct ib_umad_port *port) ida_free(&umad_ida, port->dev_num); /* balances device_initialize() */ - put_device(&port->sm_dev); + if (has_smi) + put_device(&port->sm_dev); put_device(&port->dev); } From 65528cfb21fdb68de8ae6dccae19af180d93e143 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 16 Jun 2024 19:08:34 +0300 Subject: [PATCH 38/65] net/mlx5: mlx5_ifc update for multi-plane support Add new fields to support the mlx5 multi-plane feature. Actual support will be added in the following patches.
Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/36a74a1b1d2b7b59c99cda4abad1794ddde30230.1718553901.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 09d9d87d62c6..61738990e399 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -793,7 +793,7 @@ struct mlx5_ifc_ads_bits { u8 reserved_at_2[0xe]; u8 pkey_index[0x10]; - u8 reserved_at_20[0x8]; + u8 plane_index[0x8]; u8 grh[0x1]; u8 mlid[0x7]; u8 rlid[0x10]; @@ -1990,7 +1990,8 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_c0[0x8]; u8 migration_multi_load[0x1]; u8 migration_tracking_state[0x1]; - u8 reserved_at_ca[0x6]; + u8 multiplane_qp_ud[0x1]; + u8 reserved_at_cb[0x5]; u8 migration_in_chunks[0x1]; u8 reserved_at_d1[0xf]; @@ -4172,7 +4173,8 @@ struct mlx5_ifc_hca_vport_context_bits { u8 has_smi[0x1]; u8 has_raw[0x1]; u8 grh_required[0x1]; - u8 reserved_at_104[0xc]; + u8 reserved_at_104[0x4]; + u8 num_port_plane[0x8]; u8 port_physical_state[0x4]; u8 vport_state_policy[0x4]; u8 port_state[0x4]; @@ -7692,7 +7694,7 @@ struct mlx5_ifc_mad_ifc_in_bits { u8 op_mod[0x10]; u8 remote_lid[0x10]; - u8 reserved_at_50[0x8]; + u8 plane_index[0x8]; u8 port[0x8]; u8 reserved_at_60[0x20]; @@ -9621,7 +9623,9 @@ struct mlx5_ifc_ptys_reg_bits { u8 an_disable_cap[0x1]; u8 reserved_at_3[0x5]; u8 local_port[0x8]; - u8 reserved_at_10[0xd]; + u8 reserved_at_10[0x8]; + u8 plane_ind[0x4]; + u8 reserved_at_1c[0x1]; u8 proto_mask[0x3]; u8 an_status[0x4]; From 2a5db20fa532198639671713c6213f96ff285b85 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 16 Jun 2024 19:08:35 +0300 Subject: [PATCH 39/65] RDMA/mlx5: Add support to multi-plane device and port When multi-plane is supported, a logical port, which is an aggregation of multiple physical plane ports, is exposed for data transmission. Compared with a normal mlx5 IB port, this logical port supports all functionalities except Subnet Management.
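A hedged userspace sketch of the effect (illustrative only, not part of the patch): on such a logical port ibv_query_port() reports IB_PORT_SM cleared and IB_PORT_SM_DISABLED set, so a management application can tell it must not run an SM there:

#include <stdbool.h>
#include <infiniband/verbs.h>

/* Returns true when the port may host a Subnet Manager. */
static bool port_allows_sm(struct ibv_context *ctx, uint8_t port)
{
	struct ibv_port_attr attr;

	if (ibv_query_port(ctx, port, &attr))
		return false;
	return (attr.port_cap_flags & IBV_PORT_SM) &&
	       !(attr.port_cap_flags & IBV_PORT_SM_DISABLED);
}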
Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/7e37c06c9cb243be9ac79930cd17053903785b95.1718553901.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 60 ++++++++++++++++--- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 + .../net/ethernet/mellanox/mlx5/core/vport.c | 1 + include/linux/mlx5/driver.h | 1 + 4 files changed, 55 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 4adcbbafb777..4fed8d1ed819 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1357,7 +1357,13 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port, props->sm_sl = rep->sm_sl; props->state = rep->vport_state; props->phys_state = rep->port_physical_state; - props->port_cap_flags = rep->cap_mask1; + + props->port_cap_flags = rep->cap_mask1; + if (dev->num_plane) { + props->port_cap_flags |= IB_PORT_SM_DISABLED; + props->port_cap_flags &= ~IB_PORT_SM; + } + props->gid_tbl_len = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size)); props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); props->pkey_tbl_len = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size)); @@ -2776,6 +2782,23 @@ static int mlx5_ib_event_slave_port(struct notifier_block *nb, return NOTIFY_OK; } +static int mlx5_ib_get_plane_num(struct mlx5_core_dev *mdev, u8 *num_plane) +{ + struct mlx5_hca_vport_context vport_ctx; + int err; + + *num_plane = 0; + if (!MLX5_CAP_GEN(mdev, ib_virt)) + return 0; + + err = mlx5_query_hca_vport_context(mdev, 0, 1, 0, &vport_ctx); + if (err) + return err; + + *num_plane = vport_ctx.num_plane; + return 0; +} + static int set_has_smi_cap(struct mlx5_ib_dev *dev) { struct mlx5_hca_vport_context vport_ctx; @@ -2786,10 +2809,14 @@ static int set_has_smi_cap(struct mlx5_ib_dev *dev) return 0; for (port = 1; port <= dev->num_ports; port++) { - if (!MLX5_CAP_GEN(dev->mdev, ib_virt)) { + if (dev->num_plane) { + dev->port_caps[port - 1].has_smi = false; + continue; + } else if (!MLX5_CAP_GEN(dev->mdev, ib_virt)) { dev->port_caps[port - 1].has_smi = true; continue; } + err = mlx5_query_hca_vport_context(dev->mdev, 0, port, 0, &vport_ctx); if (err) { @@ -2995,6 +3022,11 @@ static u32 get_core_cap_flags(struct ib_device *ibdev, if (rep->grh_required) ret |= RDMA_CORE_CAP_IB_GRH_REQUIRED; + if (dev->num_plane) + return ret | RDMA_CORE_CAP_PROT_IB | RDMA_CORE_CAP_IB_MAD | + RDMA_CORE_CAP_IB_CM | RDMA_CORE_CAP_IB_SA | + RDMA_CORE_CAP_AF_IB; + if (ll == IB_LINK_LAYER_INFINIBAND) return ret | RDMA_CORE_PORT_IBA_IB; @@ -4477,11 +4509,18 @@ static int mlx5r_probe(struct auxiliary_device *adev, dev = ib_alloc_device(mlx5_ib_dev, ib_dev); if (!dev) return -ENOMEM; + + if (ll == IB_LINK_LAYER_INFINIBAND) { + ret = mlx5_ib_get_plane_num(mdev, &dev->num_plane); + if (ret) + goto fail; + } + dev->port = kcalloc(num_ports, sizeof(*dev->port), GFP_KERNEL); if (!dev->port) { - ib_dealloc_device(&dev->ib_dev); - return -ENOMEM; + ret = -ENOMEM; + goto fail; } dev->mdev = mdev; @@ -4493,14 +4532,17 @@ static int mlx5r_probe(struct auxiliary_device *adev, profile = &pf_profile; ret = __mlx5_ib_add(dev, profile); - if (ret) { - kfree(dev->port); - ib_dealloc_device(&dev->ib_dev); - return ret; - } + if (ret) + goto fail_ib_add; auxiliary_set_drvdata(adev, dev); return 0; + +fail_ib_add: + kfree(dev->port); +fail: + ib_dealloc_device(&dev->ib_dev); + return ret; } static void mlx5r_remove(struct auxiliary_device *adev) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h 
b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 8f4618dee869..49a5eebe69b8 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1189,6 +1189,8 @@ struct mlx5_ib_dev { #ifdef CONFIG_MLX5_MACSEC struct mlx5_macsec macsec; #endif + + u8 num_plane; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 1005bb6935b6..0d5f750faa45 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -737,6 +737,7 @@ int mlx5_query_hca_vport_context(struct mlx5_core_dev *dev, rep->grh_required = MLX5_GET_PR(hca_vport_context, ctx, grh_required); rep->sys_image_guid = MLX5_GET64_PR(hca_vport_context, ctx, system_image_guid); + rep->num_plane = MLX5_GET_PR(hca_vport_context, ctx, num_port_plane); ex: kvfree(out); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 0d31f77396fc..a96438ded15f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -917,6 +917,7 @@ struct mlx5_hca_vport_context { u16 qkey_violation_counter; u16 pkey_violation_counter; bool grh_required; + u8 num_plane; }; #define STRUCT_FIELD(header, field) \ From bca51197620a257e2954be99b16f05115c3b2630 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 16 Jun 2024 19:08:36 +0300 Subject: [PATCH 40/65] RDMA/core: Support IB sub device with type "SMI" This patch adds 2 APIs, as well as driver operations, to support adding and deleting an IB sub device, which provides part of the functionality of its parent. A sub device has a type; for a sub device with type "SMI", it provides the smi capability through umad for its parent, meaning uverbs is not supported. A sub device cannot live without a parent. So when a parent is released, all its sub devices are released as well.
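A minimal sketch of how a driver might wire up the new operations (the "foo" driver is hypothetical; mlx5 gets a real implementation later in this series):

static struct ib_device *foo_add_sub_dev(struct ib_device *parent,
					 enum rdma_nl_dev_type type,
					 const char *name)
{
	if (type != RDMA_DEVICE_TYPE_SMI)
		return ERR_PTR(-EOPNOTSUPP);
	/* Allocate and register the child ib_device here; on failure
	 * return an ERR_PTR so the core can drop its 'get' on parent.
	 */
	return ERR_PTR(-EOPNOTSUPP);
}

static void foo_del_sub_dev(struct ib_device *sub_dev)
{
	/* Tear down whatever foo_add_sub_dev() created. */
}

static const struct ib_device_ops foo_dev_ops = {
	.add_sub_dev = foo_add_sub_dev,
	.del_sub_dev = foo_del_sub_dev,
};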
*/ static void __ib_unregister_device(struct ib_device *ib_dev) { + struct ib_device *sub, *tmp; + + mutex_lock(&ib_dev->subdev_lock); + list_for_each_entry_safe_reverse(sub, tmp, + &ib_dev->subdev_list_head, + subdev_list) { + list_del(&sub->subdev_list); + ib_dev->ops.del_sub_dev(sub); + ib_device_put(ib_dev); + } + mutex_unlock(&ib_dev->subdev_lock); + /* * We have a registration lock so that all the calls to unregister are * fully fenced, once any unregister returns the device is truely @@ -2597,6 +2615,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) ops->uverbs_no_driver_id_binding; SET_DEVICE_OP(dev_ops, add_gid); + SET_DEVICE_OP(dev_ops, add_sub_dev); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); @@ -2631,6 +2650,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, dealloc_ucontext); SET_DEVICE_OP(dev_ops, dealloc_xrcd); SET_DEVICE_OP(dev_ops, del_gid); + SET_DEVICE_OP(dev_ops, del_sub_dev); SET_DEVICE_OP(dev_ops, dereg_mr); SET_DEVICE_OP(dev_ops, destroy_ah); SET_DEVICE_OP(dev_ops, destroy_counters); @@ -2727,6 +2747,55 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) } EXPORT_SYMBOL(ib_set_device_ops); +int ib_add_sub_device(struct ib_device *parent, + enum rdma_nl_dev_type type, + const char *name) +{ + struct ib_device *sub; + int ret = 0; + + if (!parent->ops.add_sub_dev || !parent->ops.del_sub_dev) + return -EOPNOTSUPP; + + if (!ib_device_try_get(parent)) + return -EINVAL; + + sub = parent->ops.add_sub_dev(parent, type, name); + if (IS_ERR(sub)) { + ib_device_put(parent); + return PTR_ERR(sub); + } + + sub->type = type; + sub->parent = parent; + + mutex_lock(&parent->subdev_lock); + list_add_tail(&parent->subdev_list_head, &sub->subdev_list); + mutex_unlock(&parent->subdev_lock); + + return ret; +} +EXPORT_SYMBOL(ib_add_sub_device); + +int ib_del_sub_device_and_put(struct ib_device *sub) +{ + struct ib_device *parent = sub->parent; + + if (!parent) + return -EOPNOTSUPP; + + mutex_lock(&parent->subdev_lock); + list_del(&sub->subdev_list); + mutex_unlock(&parent->subdev_lock); + + ib_device_put(sub); + parent->ops.del_sub_dev(sub); + ib_device_put(parent); + + return 0; +} +EXPORT_SYMBOL(ib_del_sub_device_and_put); + #ifdef CONFIG_INFINIBAND_VIRT_DMA int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) { diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 495d5a5d0373..bc099287de9a 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -1114,7 +1114,8 @@ static int ib_uverbs_add_one(struct ib_device *device) struct ib_uverbs_device *uverbs_dev; int ret; - if (!device->ops.alloc_ucontext) + if (!device->ops.alloc_ucontext || + device->type == RDMA_DEVICE_TYPE_SMI) return -EOPNOTSUPP; uverbs_dev = kzalloc(sizeof(*uverbs_dev), GFP_KERNEL); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 825043512b2d..d4128958908f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2663,6 +2663,18 @@ struct ib_device_ops { */ int (*get_numa_node)(struct ib_device *dev); + /** + * add_sub_dev - Add a sub IB device + */ + struct ib_device *(*add_sub_dev)(struct ib_device *parent, + enum rdma_nl_dev_type type, + const char *name); + + /** + * del_sub_dev - Delete a sub IB device + */ + void (*del_sub_dev)(struct ib_device *sub_dev); + 
DECLARE_RDMA_OBJ_SIZE(ib_ah); DECLARE_RDMA_OBJ_SIZE(ib_counters); DECLARE_RDMA_OBJ_SIZE(ib_cq); @@ -2773,6 +2785,15 @@ struct ib_device { char iw_ifname[IFNAMSIZ]; u32 iw_driver_flags; u32 lag_flags; + + /* A parent device has a list of sub-devices */ + struct mutex subdev_lock; + struct list_head subdev_list_head; + + /* A sub device has a type and a parent */ + enum rdma_nl_dev_type type; + struct ib_device *parent; + struct list_head subdev_list; }; static inline void *rdma_zalloc_obj(struct ib_device *dev, size_t size, @@ -4822,4 +4843,26 @@ static inline u16 rdma_get_udp_sport(u32 fl, u32 lqpn, u32 rqpn) const struct ib_port_immutable* ib_port_immutable_read(struct ib_device *dev, unsigned int port); + +/** ib_add_sub_device - Add a sub IB device on an existing one + * + * @parent: The IB device that needs to add a sub device + * @type: The type of the new sub device + * @name: The name of the new sub device + * + * + * Return 0 on success, an error code otherwise + */ +int ib_add_sub_device(struct ib_device *parent, + enum rdma_nl_dev_type type, + const char *name); + + +/** ib_del_sub_device_and_put - Delete an IB sub device while holding a 'get' + * + * @sub: The sub device that is going to be deleted + * + * Return 0 on success, an error code otherwise + */ +int ib_del_sub_device_and_put(struct ib_device *sub); #endif /* IB_VERBS_H */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index a214fc259f28..d15ee16be722 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -602,4 +602,9 @@ enum rdma_nl_counter_mask { RDMA_COUNTER_MASK_QP_TYPE = 1, RDMA_COUNTER_MASK_PID = 1 << 1, }; + +/* Supported rdma device types. */ +enum rdma_nl_dev_type { + RDMA_DEVICE_TYPE_SMI = 1, +}; #endif /* _UAPI_RDMA_NETLINK_H */ From 36e97bbc2dca61751c5b4b7f248cd850a7654a19 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 16 Jun 2024 19:08:37 +0300 Subject: [PATCH 41/65] RDMA: Set type of rdma_ah to IB for an SMI sub device An address handle created on an SMI port has type IB, as an SMI port is used for SMI management through umad. Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/195be77aae0cce93522269f22f1303d2ccbef605.1718553901.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/rdma/ib_verbs.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index d4128958908f..e09d4f09b602 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4662,6 +4662,8 @@ static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, return RDMA_AH_ATTR_TYPE_OPA; return RDMA_AH_ATTR_TYPE_IB; } + if (dev->type == RDMA_DEVICE_TYPE_SMI) + return RDMA_AH_ATTR_TYPE_IB; return RDMA_AH_ATTR_TYPE_UNDEFINED; } From a9e0facacfd16117ab69a73e29b1335b4648773d Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 16 Jun 2024 19:08:38 +0300 Subject: [PATCH 42/65] RDMA/core: Create GSI QP only when CM is supported GSI QP is not needed if the port doesn't support connection management. In the following patches mlx5 is going to support IB ports that don't support CM.
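Taken together with the previous patch, a consumer-side sketch (illustrative only; the setup_qp*_agent() helpers are hypothetical) of how the special QPs are now gated on per-port capabilities rather than assumed to always exist:

	/* QP0 (SMI) exists only on SMI-capable ports... */
	if (rdma_cap_ib_smi(device, port_num))
		setup_qp0_agent(device, port_num);

	/* ...and QP1 (GSI) only where connection management is supported. */
	if (rdma_cap_ib_cm(device, port_num))
		setup_qp1_agent(device, port_num);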
Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/c449ebd955923b0e54c58832fd322f9d461b37a0.1718553901.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/agent.c | 32 ++++++++++++++++++++++---------- drivers/infiniband/core/mad.c | 9 ++++++--- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index f82b4260de42..3bb46696731e 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -59,7 +59,16 @@ __ib_get_agent_port(const struct ib_device *device, int port_num) struct ib_agent_port_private *entry; list_for_each_entry(entry, &ib_agent_port_list, port_list) { - if (entry->agent[1]->device == device && + /* Need to check both agent[0] and agent[1], as an agent port + * may only have one of them + */ + if (entry->agent[0] && + entry->agent[0]->device == device && + entry->agent[0]->port_num == port_num) + return entry; + + if (entry->agent[1] && + entry->agent[1]->device == device && entry->agent[1]->port_num == port_num) return entry; } @@ -172,14 +181,16 @@ int ib_agent_port_open(struct ib_device *device, int port_num) } } - /* Obtain send only MAD agent for GSI QP */ - port_priv->agent[1] = ib_register_mad_agent(device, port_num, - IB_QPT_GSI, NULL, 0, - &agent_send_handler, - NULL, NULL, 0); - if (IS_ERR(port_priv->agent[1])) { - ret = PTR_ERR(port_priv->agent[1]); - goto error3; + if (rdma_cap_ib_cm(device, port_num)) { + /* Obtain send only MAD agent for GSI QP */ + port_priv->agent[1] = ib_register_mad_agent(device, port_num, + IB_QPT_GSI, NULL, 0, + &agent_send_handler, + NULL, NULL, 0); + if (IS_ERR(port_priv->agent[1])) { + ret = PTR_ERR(port_priv->agent[1]); + goto error3; + } } spin_lock_irqsave(&ib_agent_port_list_lock, flags); @@ -212,7 +223,8 @@ int ib_agent_port_close(struct ib_device *device, int port_num) list_del(&port_priv->port_list); spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); - ib_unregister_mad_agent(port_priv->agent[1]); + if (port_priv->agent[1]) + ib_unregister_mad_agent(port_priv->agent[1]); if (port_priv->agent[0]) ib_unregister_mad_agent(port_priv->agent[0]); diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 674344eb8e2f..7439e47ff951 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -2983,9 +2983,12 @@ static int ib_mad_port_open(struct ib_device *device, if (ret) goto error6; } - ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI); - if (ret) - goto error7; + + if (rdma_cap_ib_cm(device, port_num)) { + ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI); + if (ret) + goto error7; + } snprintf(name, sizeof(name), "ib_mad%u", port_num); port_priv->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); From 026a425990af6969a15b57d6d7fa0138a7e21958 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 16 Jun 2024 19:08:39 +0300 Subject: [PATCH 43/65] RDMA/mlx5: Support plane device and driver APIs to add and delete it This patch supports the driver APIs "add_sub_dev" and "del_sub_dev", to add and delete a plane device respectively. An mlx5 plane device is an rdma SMI device; it provides the SMI capability through user MAD for its parent, the logical multi-plane aggregated device.
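As a worked illustration of the plane-to-native port mapping (the numbers are made up; the mapping matches the smi_to_native_portnum() helper added in the diff below, whose divisor is the plane device's num_ports, i.e. the parent's num_plane): with a 2-port parent where each port has 2 planes, the SMI device exposes 4 plane ports:

/* native = (plane_port - 1) / num_plane + 1, with num_plane = 2:
 *
 *   plane port 1 -> native port 1
 *   plane port 2 -> native port 1
 *   plane port 3 -> native port 2
 *   plane port 4 -> native port 2
 */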
For a plane port: - It supports QP0 only; - When adding a plane device, all plane ports are added; - For some commands like mad_ifc, both plane_index and native portnum are needed; - When querying or modifying a plane port context, the native portnum must be used, as the query/modify_hca_vport_context command doesn't support plane ports. Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/e933cd0562aece181f8657af2ca0f5b387d0f14e.1718553901.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/cmd.c | 12 ++- drivers/infiniband/hw/mlx5/cmd.h | 2 +- drivers/infiniband/hw/mlx5/mad.c | 2 +- drivers/infiniband/hw/mlx5/main.c | 116 ++++++++++++++++++++++++++- drivers/infiniband/hw/mlx5/mlx5_ib.h | 8 ++ drivers/infiniband/hw/mlx5/qp.c | 7 +- drivers/infiniband/hw/mlx5/qpc.c | 13 ++- 7 files changed, 147 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 1d0c8d5e745b..895b62cc528d 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -177,7 +177,7 @@ int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid) return mlx5_cmd_exec_in(dev, dealloc_xrcd, in); } -int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, +int mlx5_cmd_mad_ifc(struct mlx5_ib_dev *dev, const void *inb, void *outb, u16 opmod, u8 port) { int outlen = MLX5_ST_SZ_BYTES(mad_ifc_out); @@ -195,12 +195,18 @@ int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, MLX5_SET(mad_ifc_in, in, opcode, MLX5_CMD_OP_MAD_IFC); MLX5_SET(mad_ifc_in, in, op_mod, opmod); - MLX5_SET(mad_ifc_in, in, port, port); + if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) { + MLX5_SET(mad_ifc_in, in, plane_index, port); + MLX5_SET(mad_ifc_in, in, port, + smi_to_native_portnum(dev, port)); + } else { + MLX5_SET(mad_ifc_in, in, port, port); + } data = MLX5_ADDR_OF(mad_ifc_in, in, mad); memcpy(data, inb, MLX5_FLD_SZ_BYTES(mad_ifc_in, mad)); - err = mlx5_cmd_exec_inout(dev, mad_ifc, in, out); + err = mlx5_cmd_exec_inout(dev->mdev, mad_ifc, in, out); if (err) goto out; diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index 93a971a40d11..e5cd31270443 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -54,7 +54,7 @@ int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn, u16 uid); int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid); int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid); -int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, +int mlx5_cmd_mad_ifc(struct mlx5_ib_dev *dev, const void *inb, void *outb, u16 opmod, u8 port); int mlx5_cmd_uar_alloc(struct mlx5_core_dev *dev, u32 *uarn, u16 uid); int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid); diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 3e43687a7f6f..ead836d159d3 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -69,7 +69,7 @@ static int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, if (ignore_bkey || !in_wc) op_modifier |= 0x2; - return mlx5_cmd_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier, + return mlx5_cmd_mad_ifc(dev, in_mad, response_mad, op_modifier, port); } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 4fed8d1ed819..8a1a43700b45 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++
b/drivers/infiniband/hw/mlx5/main.c @@ -282,6 +282,14 @@ struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev, struct mlx5_ib_multiport_info *mpi; struct mlx5_ib_port *port; + if (ibdev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) { + if (native_port_num) + *native_port_num = smi_to_native_portnum(ibdev, + ib_port_num); + return ibdev->mdev; + + } + if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET) { if (native_port_num) @@ -1347,6 +1355,9 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port, /* props being zeroed by the caller, avoid zeroing it here */ + if (ibdev->type == RDMA_DEVICE_TYPE_SMI) + port = smi_to_native_portnum(dev, port); + err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep); if (err) goto out; @@ -1362,7 +1373,8 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port, if (dev->num_plane) { props->port_cap_flags |= IB_PORT_SM_DISABLED; props->port_cap_flags &= ~IB_PORT_SM; - } + } else if (ibdev->type == RDMA_DEVICE_TYPE_SMI) + props->port_cap_flags &= ~IB_PORT_CM_SUP; props->gid_tbl_len = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size)); props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); @@ -2812,7 +2824,8 @@ static int set_has_smi_cap(struct mlx5_ib_dev *dev) if (dev->num_plane) { dev->port_caps[port - 1].has_smi = false; continue; - } else if (!MLX5_CAP_GEN(dev->mdev, ib_virt)) { + } else if (!MLX5_CAP_GEN(dev->mdev, ib_virt) || + dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) { dev->port_caps[port - 1].has_smi = true; continue; } @@ -3026,6 +3039,8 @@ static u32 get_core_cap_flags(struct ib_device *ibdev, return ret | RDMA_CORE_CAP_PROT_IB | RDMA_CORE_CAP_IB_MAD | RDMA_CORE_CAP_IB_CM | RDMA_CORE_CAP_IB_SA | RDMA_CORE_CAP_AF_IB; + else if (ibdev->type == RDMA_DEVICE_TYPE_SMI) + return ret | RDMA_CORE_CAP_IB_MAD | RDMA_CORE_CAP_IB_SMI; if (ll == IB_LINK_LAYER_INFINIBAND) return ret | RDMA_CORE_PORT_IBA_IB; @@ -3062,6 +3077,9 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u32 port_num, return err; if (ll == IB_LINK_LAYER_INFINIBAND) { + if (ibdev->type == RDMA_DEVICE_TYPE_SMI) + port_num = smi_to_native_portnum(dev, port_num); + err = mlx5_query_hca_vport_context(dev->mdev, 0, port_num, 0, &rep); if (err) @@ -3862,12 +3880,18 @@ err_mp: return err; } +static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent, + enum rdma_nl_dev_type type, + const char *name); +static void mlx5_ib_del_sub_dev(struct ib_device *sub_dev); + static const struct ib_device_ops mlx5_ib_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_MLX5, .uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION, .add_gid = mlx5_ib_add_gid, + .add_sub_dev = mlx5_ib_add_sub_dev, .alloc_mr = mlx5_ib_alloc_mr, .alloc_mr_integrity = mlx5_ib_alloc_mr_integrity, .alloc_pd = mlx5_ib_alloc_pd, @@ -3882,6 +3906,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .dealloc_pd = mlx5_ib_dealloc_pd, .dealloc_ucontext = mlx5_ib_dealloc_ucontext, .del_gid = mlx5_ib_del_gid, + .del_sub_dev = mlx5_ib_del_sub_dev, .dereg_mr = mlx5_ib_dereg_mr, .destroy_ah = mlx5_ib_destroy_ah, .destroy_cq = mlx5_ib_destroy_cq, @@ -4171,7 +4196,9 @@ static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) { const char *name; - if (!mlx5_lag_is_active(dev->mdev)) + if (dev->sub_dev_name) + name = dev->sub_dev_name; + else if (!mlx5_lag_is_active(dev->mdev)) name = "mlx5_%d"; else name = "mlx5_bond_%d"; @@ -4432,6 +4459,89 @@ const struct mlx5_ib_profile raw_eth_profile = { NULL), }; +static const struct mlx5_ib_profile plane_profile 
= { + STAGE_CREATE(MLX5_IB_STAGE_INIT, + mlx5_ib_stage_init_init, + mlx5_ib_stage_init_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CAPS, + mlx5_ib_stage_caps_init, + mlx5_ib_stage_caps_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB, + mlx5_ib_stage_non_default_cb, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_QP, + mlx5_init_qp_table, + mlx5_cleanup_qp_table), + STAGE_CREATE(MLX5_IB_STAGE_SRQ, + mlx5_init_srq_table, + mlx5_cleanup_srq_table), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, + mlx5_ib_dev_res_init, + mlx5_ib_dev_res_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_BFREG, + mlx5_ib_stage_bfrag_init, + mlx5_ib_stage_bfrag_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_IB_REG, + mlx5_ib_stage_ib_reg_init, + mlx5_ib_stage_ib_reg_cleanup), +}; + +static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent, + enum rdma_nl_dev_type type, + const char *name) +{ + struct mlx5_ib_dev *mparent = to_mdev(parent), *mplane; + enum rdma_link_layer ll; + int ret; + + if (mparent->smi_dev) + return ERR_PTR(-EEXIST); + + ll = mlx5_port_type_cap_to_rdma_ll(MLX5_CAP_GEN(mparent->mdev, + port_type)); + if (type != RDMA_DEVICE_TYPE_SMI || !mparent->num_plane || + ll != IB_LINK_LAYER_INFINIBAND || + !MLX5_CAP_GEN_2(mparent->mdev, multiplane_qp_ud)) + return ERR_PTR(-EOPNOTSUPP); + + mplane = ib_alloc_device(mlx5_ib_dev, ib_dev); + if (!mplane) + return ERR_PTR(-ENOMEM); + + mplane->port = kcalloc(mparent->num_plane * mparent->num_ports, + sizeof(*mplane->port), GFP_KERNEL); + if (!mplane->port) { + ret = -ENOMEM; + goto fail_kcalloc; + } + + mplane->ib_dev.type = type; + mplane->mdev = mparent->mdev; + mplane->num_ports = mparent->num_plane; + mplane->sub_dev_name = name; + + ret = __mlx5_ib_add(mplane, &plane_profile); + if (ret) + goto fail_ib_add; + + mparent->smi_dev = mplane; + return &mplane->ib_dev; + +fail_ib_add: + kfree(mplane->port); +fail_kcalloc: + ib_dealloc_device(&mplane->ib_dev); + return ERR_PTR(ret); +} + +static void mlx5_ib_del_sub_dev(struct ib_device *sub_dev) +{ + struct mlx5_ib_dev *mdev = to_mdev(sub_dev); + + to_mdev(sub_dev->parent)->smi_dev = NULL; + __mlx5_ib_remove(mdev, mdev->profile, MLX5_IB_STAGE_MAX); +} + static int mlx5r_mp_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 49a5eebe69b8..d5eb1b726675 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1191,6 +1191,8 @@ struct mlx5_ib_dev { #endif u8 num_plane; + struct mlx5_ib_dev *smi_dev; + const char *sub_dev_name; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -1699,4 +1701,10 @@ static inline bool mlx5_umem_needs_ats(struct mlx5_ib_dev *dev, int set_roce_addr(struct mlx5_ib_dev *dev, u32 port_num, unsigned int index, const union ib_gid *gid, const struct ib_gid_attr *attr); + +static inline u32 smi_to_native_portnum(struct mlx5_ib_dev *dev, u32 port) +{ + return (port - 1) / dev->num_ports + 1; +} + #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 079ad927182f..e39b1a101e97 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4217,7 +4217,12 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, /* todo implement counter_index functionality */ - if (is_sqp(qp->type)) + if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI && is_qp0(qp->type)) { + MLX5_SET(ads, pri_path, vhca_port_num, + smi_to_native_portnum(dev, qp->port)); + if (cur_state == 
IB_QPS_INIT && new_state == IB_QPS_RTR) + MLX5_SET(ads, pri_path, plane_index, qp->port); + } else if (is_sqp(qp->type)) MLX5_SET(ads, pri_path, vhca_port_num, qp->port); if (attr_mask & IB_QP_PORT) diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index d9cf6982d645..d3dcc272200a 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -249,7 +249,8 @@ int mlx5_qpc_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, if (err) goto err_cmd; - mlx5_debug_qp_add(dev->mdev, qp); + if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI) + mlx5_debug_qp_add(dev->mdev, qp); return 0; @@ -307,7 +308,8 @@ int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp) { u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; - mlx5_debug_qp_remove(dev->mdev, qp); + if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI) + mlx5_debug_qp_remove(dev->mdev, qp); destroy_resource_common(dev, qp); @@ -504,7 +506,9 @@ int mlx5_init_qp_table(struct mlx5_ib_dev *dev) spin_lock_init(&table->lock); INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); xa_init(&table->dct_xa); - mlx5_qp_debugfs_init(dev->mdev); + + if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI) + mlx5_qp_debugfs_init(dev->mdev); table->nb.notifier_call = rsc_event_notifier; mlx5_notifier_register(dev->mdev, &table->nb); @@ -517,7 +521,8 @@ void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev) struct mlx5_qp_table *table = &dev->qp_table; mlx5_notifier_unregister(dev->mdev, &table->nb); - mlx5_qp_debugfs_cleanup(dev->mdev); + if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI) + mlx5_qp_debugfs_cleanup(dev->mdev); } int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, From 060c642b2ab8b40b39f9db99c1d14c7d19ba507f Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 16 Jun 2024 19:08:40 +0300 Subject: [PATCH 44/65] RDMA/nldev: Add support to add/delete a sub IB device through netlink Add new netlink commands and attributes to support adding and deleting a sub IB device with admin privilege. 
Examples: $ rdma dev add smi1 type SMI parent ibp8s0f1 $ rdma dev del smi1 Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/77cbf1b36359642be8a8d8c5c2f4e585b544282f.1718553901.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 59 ++++++++++++++++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 6 ++++ 2 files changed, 65 insertions(+) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index bc79ee630d8d..b5f87e7a1cfd 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -167,6 +167,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC] = { .type = NLA_U8 }, [RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DRIVER_DETAILS] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DEV_TYPE] = { .type = NLA_U8 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -2548,6 +2549,56 @@ err: return ret; } +static int nldev_newdev(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + enum rdma_nl_dev_type type; + struct ib_device *parent; + char name[IFNAMSIZ] = {}; + u32 parentid; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_DEV_TYPE]) + return -EINVAL; + + nla_strscpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], sizeof(name)); + type = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_TYPE]); + parentid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + parent = ib_device_get_by_index(sock_net(skb->sk), parentid); + if (!parent) + return -EINVAL; + + ret = ib_add_sub_device(parent, type, name); + ib_device_put(parent); + + return ret; +} + +static int nldev_deldev(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + u32 devid; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + devid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), devid); + if (!device) + return -EINVAL; + + return ib_del_sub_device_and_put(device); +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -2646,6 +2697,14 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_STAT_GET_STATUS] = { .doit = nldev_stat_get_counter_status_doit, }, + [RDMA_NLDEV_CMD_NEWDEV] = { + .doit = nldev_newdev, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_DELDEV] = { + .doit = nldev_deldev, + .flags = RDMA_NL_ADMIN_PERM, + }, }; void __init nldev_init(void) diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index d15ee16be722..bd52fb325e22 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -301,6 +301,10 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_RES_SRQ_GET_RAW, + RDMA_NLDEV_CMD_NEWDEV, + + RDMA_NLDEV_CMD_DELDEV, + RDMA_NLDEV_NUM_OPS }; @@ -564,6 +568,8 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_RES_SUBTYPE, /* string */ + RDMA_NLDEV_ATTR_DEV_TYPE, /* u8 */ + /* * Always the end */ From 294424839b5ec2ecd17f4c8409796846b2b8dd31 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 16 Jun 2024 19:08:41 +0300 
Subject: [PATCH 45/65] RDMA/nldev: Add support to dump device type and parent device if they exist If a device has a specific type or a parent device, dump them as well. Example: $ rdma dev show smi1 3: smi1: node_type ca fw 20.38.1002 node_guid 9803:9b03:009f:d5ef sys_image_guid 9803:9b03:009f:d5ee type smi parent ibp8s0f1 Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/4c022e3e34b5de1254a3b367d502a362cdd0c53a.1718553901.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 10 ++++++++++ include/uapi/rdma/rdma_netlink.h | 2 ++ 2 files changed, 12 insertions(+) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index b5f87e7a1cfd..025efce540a7 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -168,6 +168,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DRIVER_DETAILS] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DEV_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_PARENT_NAME] = { .type = NLA_NUL_STRING }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -302,6 +303,15 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, device->use_cq_dim)) return -EMSGSIZE; + if (device->type && + nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_TYPE, device->type)) + return -EMSGSIZE; + + if (device->parent && + nla_put_string(msg, RDMA_NLDEV_ATTR_PARENT_NAME, + dev_name(&device->parent->dev))) + return -EMSGSIZE; + /* * Link type is determined on first port and mlx4 device * which can potentially have two different link type for the same diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index bd52fb325e22..4b69242d7848 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -570,6 +570,8 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_DEV_TYPE, /* u8 */ + RDMA_NLDEV_ATTR_PARENT_NAME, /* string */ + /* * Always the end */ From 3b43399b297c52cfe4dfe0194b6ad89d52bf8c35 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 16 Jun 2024 19:08:42 +0300 Subject: [PATCH 46/65] RDMA/mlx5: Add plane index support when querying PTYS registers Support the new "plane_ind" field when querying port PTYS registers. This is needed when querying the rate of a plane port.
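For context, the convention visible in the mlx5_query_hca_port() hunk below is that only SMI-type plane devices pass a non-zero plane index; every pre-existing caller passes 0. A minimal sketch of that selection, where pick_plane_index() is a hypothetical helper and not something added by this patch:

/* Sketch: non-SMI devices always query plane 0; on an SMI device the
 * IB port number doubles as the plane index, before the port is
 * translated to the native port number.
 */
static u8 pick_plane_index(struct ib_device *ibdev, u32 port)
{
        return ibdev->type == RDMA_DEVICE_TYPE_SMI ? port : 0;
}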
Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/1f703c36306aa46917fcd88eadbb23b3e380d526.1718553901.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 12 +++++++----- drivers/net/ethernet/mellanox/mlx5/core/en/port.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 2 +- .../net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/port.c | 10 ++++++---- include/linux/mlx5/port.h | 5 +++-- 6 files changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 8a1a43700b45..5de0d95d1f45 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -511,10 +511,10 @@ static int mlx5_query_port_roce(struct ib_device *device, u32 port_num, */ if (dev->is_rep) err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, - 1); + 1, 0); else err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, - mdev_port_num); + mdev_port_num, 0); if (err) goto out; ext = !!MLX5_GET_ETH_PROTO(ptys_reg, out, true, eth_proto_capability); @@ -1341,11 +1341,11 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port, struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_hca_vport_context *rep; + u8 vl_hw_cap, plane_index = 0; u16 max_mtu; u16 oper_mtu; int err; u16 ib_link_width_oper; - u8 vl_hw_cap; rep = kzalloc(sizeof(*rep), GFP_KERNEL); if (!rep) { @@ -1355,8 +1355,10 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port, /* props being zeroed by the caller, avoid zeroing it here */ - if (ibdev->type == RDMA_DEVICE_TYPE_SMI) + if (ibdev->type == RDMA_DEVICE_TYPE_SMI) { + plane_index = port; port = smi_to_native_portnum(dev, port); + } err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep); if (err) @@ -1388,7 +1390,7 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port, props->port_cap_flags2 = rep->cap_mask2; err = mlx5_query_ib_port_oper(mdev, &ib_link_width_oper, - &props->active_speed, port); + &props->active_speed, port, plane_index); if (err) goto out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c index b4efc780e297..5f6a0605e4ae 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c @@ -41,7 +41,7 @@ void mlx5_port_query_eth_autoneg(struct mlx5_core_dev *dev, u8 *an_status, *an_disable_cap = 0; *an_disable_admin = 0; - if (mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_EN, 1)) + if (mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_EN, 1, 0)) return; *an_status = MLX5_GET(ptys_reg, out, an_status); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 3320f12ba2db..f57e0184c12b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1195,7 +1195,7 @@ static int mlx5e_ethtool_get_link_ksettings(struct mlx5e_priv *priv, bool ext; int err; - err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1); + err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1, 0); if (err) { netdev_err(priv->netdev, "%s: query port ptys failed: %d\n", __func__, err); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c index 779d92b762d3..b8aadbea1312 100644 
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c @@ -215,7 +215,7 @@ static int mlx5i_get_link_ksettings(struct net_device *netdev, int speed, ret; ret = mlx5_query_ib_port_oper(mdev, &ib_link_width_oper, &ib_proto_oper, - 1); + 1, 0); if (ret) return ret; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 7fba1c46e2ac..50931584132b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -144,11 +144,13 @@ int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps) EXPORT_SYMBOL_GPL(mlx5_set_port_caps); int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys, - int ptys_size, int proto_mask, u8 local_port) + int ptys_size, int proto_mask, + u8 local_port, u8 plane_index) { u32 in[MLX5_ST_SZ_DW(ptys_reg)] = {0}; MLX5_SET(ptys_reg, in, local_port, local_port); + MLX5_SET(ptys_reg, in, plane_ind, plane_index); MLX5_SET(ptys_reg, in, proto_mask, proto_mask); return mlx5_core_access_reg(dev, in, sizeof(in), ptys, ptys_size, MLX5_REG_PTYS, 0, 0); @@ -167,13 +169,13 @@ int mlx5_set_port_beacon(struct mlx5_core_dev *dev, u16 beacon_duration) } int mlx5_query_ib_port_oper(struct mlx5_core_dev *dev, u16 *link_width_oper, - u16 *proto_oper, u8 local_port) + u16 *proto_oper, u8 local_port, u8 plane_index) { u32 out[MLX5_ST_SZ_DW(ptys_reg)]; int err; err = mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_IB, - local_port); + local_port, plane_index); if (err) return err; @@ -1114,7 +1116,7 @@ int mlx5_port_query_eth_proto(struct mlx5_core_dev *dev, u8 port, bool ext, if (!eproto) return -EINVAL; - err = mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_EN, port); + err = mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_EN, port, 0); if (err) return err; diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 26092c78a985..e68d42b8ce65 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -155,10 +155,11 @@ struct mlx5_port_eth_proto { int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps); int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys, - int ptys_size, int proto_mask, u8 local_port); + int ptys_size, int proto_mask, + u8 local_port, u8 plane_index); int mlx5_query_ib_port_oper(struct mlx5_core_dev *dev, u16 *link_width_oper, - u16 *proto_oper, u8 local_port); + u16 *proto_oper, u8 local_port, u8 plane_index); void mlx5_toggle_port_link(struct mlx5_core_dev *dev); int mlx5_set_port_admin_status(struct mlx5_core_dev *dev, enum mlx5_port_status status); From c6b6677d85d47f5562020aa082d9b5f90738fdcd Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 16 Jun 2024 19:08:43 +0300 Subject: [PATCH 47/65] net/mlx5: mlx5_ifc update for accessing ppcnt register of plane ports This patch adds new fields to support multi-plane and the extend port counters group. Actual support will be added in the next patch. 
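As background for the new layout below: mlx5_ifc structures describe registers bit by bit, and each 64-bit counter is expressed as adjacent _high/_low dwords, so a single MLX5_GET64() on the _high field reads the whole 64-bit value. A minimal sketch of such a read, assuming "out" already holds a successful PPCNT query for the extended group (the helper name is illustrative only):

/* Sketch: pull one 64-bit extended counter out of a PPCNT response. */
static u64 read_port_xmit_data(void *out)
{
        void *cnt = MLX5_ADDR_OF(ppcnt_reg, out, counter_set);

        /* _high names the first dword; MLX5_GET64 spans high and low. */
        return MLX5_GET64(ib_ext_port_cntrs_grp_data_layout, cnt,
                          port_xmit_data_high);
}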
Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/70221cdd79aad0e21cbf385d9567e3ebffbc5137.1718553901.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 47 +++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 61738990e399..5fea7b747607 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2651,6 +2651,46 @@ struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits { u8 port_xmit_wait[0x20]; }; +struct mlx5_ifc_ib_ext_port_cntrs_grp_data_layout_bits { + u8 reserved_at_0[0x300]; + + u8 port_xmit_data_high[0x20]; + + u8 port_xmit_data_low[0x20]; + + u8 port_rcv_data_high[0x20]; + + u8 port_rcv_data_low[0x20]; + + u8 port_xmit_pkts_high[0x20]; + + u8 port_xmit_pkts_low[0x20]; + + u8 port_rcv_pkts_high[0x20]; + + u8 port_rcv_pkts_low[0x20]; + + u8 reserved_at_400[0x80]; + + u8 port_unicast_xmit_pkts_high[0x20]; + + u8 port_unicast_xmit_pkts_low[0x20]; + + u8 port_multicast_xmit_pkts_high[0x20]; + + u8 port_multicast_xmit_pkts_low[0x20]; + + u8 port_unicast_rcv_pkts_high[0x20]; + + u8 port_unicast_rcv_pkts_low[0x20]; + + u8 port_multicast_rcv_pkts_high[0x20]; + + u8 port_multicast_rcv_pkts_low[0x20]; + + u8 reserved_at_580[0x240]; +}; + struct mlx5_ifc_eth_per_tc_prio_grp_data_layout_bits { u8 transmit_queue_high[0x20]; @@ -4543,6 +4583,7 @@ union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits { struct mlx5_ifc_eth_per_tc_prio_grp_data_layout_bits eth_per_tc_prio_grp_data_layout; struct mlx5_ifc_eth_per_tc_congest_prio_grp_data_layout_bits eth_per_tc_congest_prio_grp_data_layout; struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits ib_port_cntrs_grp_data_layout; + struct mlx5_ifc_ib_ext_port_cntrs_grp_data_layout_bits ib_ext_port_cntrs_grp_data_layout; struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs; struct mlx5_ifc_phys_layer_statistical_cntrs_bits phys_layer_statistical_cntrs; u8 reserved_at_0[0x7c0]; @@ -9851,8 +9892,10 @@ struct mlx5_ifc_ppcnt_reg_bits { u8 grp[0x6]; u8 clr[0x1]; - u8 reserved_at_21[0x1c]; - u8 prio_tc[0x3]; + u8 reserved_at_21[0x13]; + u8 plane_ind[0x4]; + u8 reserved_at_38[0x3]; + u8 prio_tc[0x5]; union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits counter_set; }; From 7a2210a57d423cbdb9c5f2e8b12c65d7472ff147 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 16 Jun 2024 19:08:44 +0300 Subject: [PATCH 48/65] RDMA/mlx5: Support per-plane port IB counters by querying PPCNT register Supports per-plane port counters by querying PPCNT register with the "extended port counters" group, as the query_vport_counter command doesn't support plane ports. 
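One unit detail worth noting in the conversion below: PPCNT reports the data counters in octets, while the PMA PortXmitData/PortRcvData attributes are defined in units of four octets, hence the right-shift by two before the value is stored big-endian in the MAD. A one-line sketch of that conversion:

/* Sketch: octets -> PMA 4-octet units, big-endian as the MAD expects. */
static __be64 octets_to_pma_units(u64 octets)
{
        return cpu_to_be64(octets >> 2);
}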
Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/06ffb582d67159b7def4654c8272d3d6e8bd2f2f.1718553901.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mad.c | 67 +++++++++++++++++++++++++++----- include/linux/mlx5/device.h | 1 + 2 files changed, 58 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index ead836d159d3..1b6c5e37d169 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -147,8 +147,39 @@ static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt, vl_15_dropped); } -static int query_ib_ppcnt(struct mlx5_core_dev *dev, u8 port_num, void *out, - size_t sz) +static void pma_cnt_ext_assign_ppcnt(struct ib_pma_portcounters_ext *cnt_ext, + void *out) +{ + void *out_pma = MLX5_ADDR_OF(ppcnt_reg, out, + counter_set); + +#define MLX5_GET_EXT_CNTR(counter_name) \ + MLX5_GET64(ib_ext_port_cntrs_grp_data_layout, \ + out_pma, counter_name##_high) + + cnt_ext->port_xmit_data = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_xmit_data) >> 2); + cnt_ext->port_rcv_data = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_rcv_data) >> 2); + + cnt_ext->port_xmit_packets = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_xmit_pkts)); + cnt_ext->port_rcv_packets = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_rcv_pkts)); + + cnt_ext->port_unicast_xmit_packets = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_unicast_xmit_pkts)); + cnt_ext->port_unicast_rcv_packets = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_unicast_rcv_pkts)); + + cnt_ext->port_multicast_xmit_packets = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_multicast_xmit_pkts)); + cnt_ext->port_multicast_rcv_packets = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_multicast_rcv_pkts)); +} + +static int query_ib_ppcnt(struct mlx5_core_dev *dev, u8 port_num, u8 plane_num, + void *out, size_t sz, bool ext) { u32 *in; int err; @@ -160,8 +191,14 @@ static int query_ib_ppcnt(struct mlx5_core_dev *dev, u8 port_num, void *out, } MLX5_SET(ppcnt_reg, in, local_port, port_num); + MLX5_SET(ppcnt_reg, in, plane_ind, plane_num); - MLX5_SET(ppcnt_reg, in, grp, MLX5_INFINIBAND_PORT_COUNTERS_GROUP); + if (ext) + MLX5_SET(ppcnt_reg, in, grp, + MLX5_INFINIBAND_EXTENDED_PORT_COUNTERS_GROUP); + else + MLX5_SET(ppcnt_reg, in, grp, + MLX5_INFINIBAND_PORT_COUNTERS_GROUP); err = mlx5_core_access_reg(dev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); @@ -189,7 +226,8 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num, mdev_port_num = 1; } if (MLX5_CAP_GEN(dev->mdev, num_ports) == 1 && - !mlx5_core_mp_enabled(mdev)) { + !mlx5_core_mp_enabled(mdev) && + dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI) { /* set local port to one for Function-Per-Port HCA. 
*/ mdev = dev->mdev; mdev_port_num = 1; @@ -208,7 +246,8 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num, if (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT) { struct ib_pma_portcounters_ext *pma_cnt_ext = (struct ib_pma_portcounters_ext *)(out_mad->data + 40); - int sz = MLX5_ST_SZ_BYTES(query_vport_counter_out); + int sz = max(MLX5_ST_SZ_BYTES(query_vport_counter_out), + MLX5_ST_SZ_BYTES(ppcnt_reg)); out_cnt = kvzalloc(sz, GFP_KERNEL); if (!out_cnt) { @@ -216,10 +255,18 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num, goto done; } - err = mlx5_core_query_vport_counter(mdev, 0, 0, mdev_port_num, - out_cnt); - if (!err) - pma_cnt_ext_assign(pma_cnt_ext, out_cnt); + if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) { + err = query_ib_ppcnt(mdev, mdev_port_num, + port_num, out_cnt, sz, 1); + if (!err) + pma_cnt_ext_assign_ppcnt(pma_cnt_ext, out_cnt); + } else { + err = mlx5_core_query_vport_counter(mdev, 0, 0, + mdev_port_num, + out_cnt); + if (!err) + pma_cnt_ext_assign(pma_cnt_ext, out_cnt); + } } else { struct ib_pma_portcounters *pma_cnt = (struct ib_pma_portcounters *)(out_mad->data + 40); @@ -231,7 +278,7 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num, goto done; } - err = query_ib_ppcnt(mdev, mdev_port_num, out_cnt, sz); + err = query_ib_ppcnt(mdev, mdev_port_num, 0, out_cnt, sz, 0); if (!err) pma_cnt_assign(pma_cnt, out_cnt); } diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index d7bb31d9a446..68bd1b4737ea 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1466,6 +1466,7 @@ enum { MLX5_PER_TRAFFIC_CLASS_CONGESTION_GROUP = 0x13, MLX5_PHYSICAL_LAYER_STATISTICAL_GROUP = 0x16, MLX5_INFINIBAND_PORT_COUNTERS_GROUP = 0x20, + MLX5_INFINIBAND_EXTENDED_PORT_COUNTERS_GROUP = 0x21, }; enum { From 346d2fc606a85a728a02cb26ff8304f80b114b2d Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Mon, 1 Jul 2024 09:57:52 +0000 Subject: [PATCH 49/65] RDMA/efa: Add EFA 0xefa3 PCI ID Add support for 0xefa3 devices. 
Reviewed-by: Yonatan Nachum Reviewed-by: Yossi Leybovich Signed-off-by: Michael Margolin Link: https://lore.kernel.org/r/20240701095752.20246-1-mrgolin@amazon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/efa/efa_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index 99f9ac23c721..1a777791bea3 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -16,11 +16,13 @@ #define PCI_DEV_ID_EFA0_VF 0xefa0 #define PCI_DEV_ID_EFA1_VF 0xefa1 #define PCI_DEV_ID_EFA2_VF 0xefa2 +#define PCI_DEV_ID_EFA3_VF 0xefa3 static const struct pci_device_id efa_pci_tbl[] = { { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA0_VF) }, { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA1_VF) }, { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA2_VF) }, + { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA3_VF) }, { } }; From 1b8ca05469315ef7636f898ff83890c39e3deeb8 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 1 Jul 2024 15:42:28 +0300 Subject: [PATCH 50/65] RDMA/qib: Fix truncation compilation warnings in qib_init.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/infiniband/hw/qib/qib_init.c: In function ‘qib_init_one’: drivers/infiniband/hw/qib/qib_init.c:586:67: error: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size between 0 and 3 [-Werror=format-truncation=] 586 | snprintf(wq_name, sizeof(wq_name), "qib%d_%d", | ^~ In function ‘qib_create_workqueues’, inlined from ‘qib_init_one’ at drivers/infiniband/hw/qib/qib_init.c:1438:8: drivers/infiniband/hw/qib/qib_init.c:586:60: note: directive argument in the range [-2147483643, 254] 586 | snprintf(wq_name, sizeof(wq_name), "qib%d_%d", | ^~~~~~~~~~ drivers/infiniband/hw/qib/qib_init.c:586:25: note: ‘snprintf’ output between 7 and 27 bytes into a destination of size 8 586 | snprintf(wq_name, sizeof(wq_name), "qib%d_%d", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 587 | dd->unit, pidx); | ~~~~~~~~~~~~~~~ cc1: all warnings being treated as errors Link: https://lore.kernel.org/r/ab5222c414a01e9d2c5129ef26836aace9ee2aa5.1719837715.git.leon@kernel.org Acked-by: Dennis Dalessandro Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/qib/qib_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 33667becd52b..db3b25c8433a 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -581,7 +581,7 @@ static int qib_create_workqueues(struct qib_devdata *dd) for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; if (!ppd->qib_wq) { - char wq_name[8]; /* 3 + 2 + 1 + 1 + 1 */ + char wq_name[23]; snprintf(wq_name, sizeof(wq_name), "qib%d_%d", dd->unit, pidx); From f802078d3cb882b9be555fd903040388e29eae87 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 1 Jul 2024 15:42:29 +0300 Subject: [PATCH 51/65] RDMA/qib: Fix truncation compilation warnings in qib_verbs.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduce nodename string size to fit IB_DEVICE_NODE_DESC_MAX. 
drivers/infiniband/hw/qib/qib_verbs.c: In function ‘qib_register_ib_device’: drivers/infiniband/hw/qib/qib_verbs.c:1554:40: error: ‘%s’ directive output may be truncated writing up to 64 bytes into a region of size 43 [-Werror=format-truncation=] 1554 | "Intel Infiniband HCA %s", init_utsname()->nodename); | ^~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/infiniband/hw/qib/qib_verbs.c:1553:9: note: ‘snprintf’ output between 22 and 86 bytes into a destination of size 64 1553 | snprintf(ibdev->node_desc, sizeof(ibdev->node_desc), | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1554 | "Intel Infiniband HCA %s", init_utsname()->nodename); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cc1: all warnings being treated as errors Link: https://lore.kernel.org/r/1fb6393fa2e0702fef995834c3c7db972bbc4d06.1719837715.git.leon@kernel.org Acked-by: Dennis Dalessandro Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/qib/qib_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 0080f0be72fe..5fcb41970ad9 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1551,7 +1551,7 @@ int qib_register_ib_device(struct qib_devdata *dd) ibdev->dev.parent = &dd->pcidev->dev; snprintf(ibdev->node_desc, sizeof(ibdev->node_desc), - "Intel Infiniband HCA %s", init_utsname()->nodename); + "Intel Infiniband HCA %.42s", init_utsname()->nodename); /* * Fill in rvt info object. From af48f95492dc1af36d9636a750ec492035c0ed7d Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 1 Jul 2024 15:40:48 +0300 Subject: [PATCH 52/65] RDMA/core: Introduce "name_assign_type" for an IB device The name_assign_type indicates how the name is provided. Currently these types are supported: - RDMA_NAME_ASSIGN_TYPE_UNKNOWN: Unknown or not set; - RDMA_NAME_ASSIGN_TYPE_USER: Name is provided by the user; User-created sub devices and the rxe and siw devices have this type. When filling nl device info, it is set in the new attribute RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE. User-space tools like udev "rdma_rename" could check this attribute to determine whether this device needs to be renamed.
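For illustration, a tool in the rdma_rename mold would read this attribute from an RDMA_NLDEV_CMD_GET response and leave user-named devices alone. A minimal userspace sketch, assuming a libnl-style attribute table tb[] has already been parsed from the netlink message:

/* Sketch: only devices without a user-assigned name may be renamed. */
static bool may_rename(struct nlattr *tb[])
{
        uint8_t type = RDMA_NAME_ASSIGN_TYPE_UNKNOWN;

        if (tb[RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE])
                type = nla_get_u8(tb[RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE]);

        return type != RDMA_NAME_ASSIGN_TYPE_USER;
}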
Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/522591bef9a369cc8e5dcb77787e017bffee37fe.1719837610.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 5 +++++ drivers/infiniband/hw/mlx5/main.c | 5 +++-- drivers/infiniband/sw/rxe/rxe_net.c | 1 + drivers/infiniband/sw/siw/siw_main.c | 1 + include/rdma/ib_verbs.h | 8 ++++++++ include/uapi/rdma/rdma_netlink.h | 9 +++++++++ 6 files changed, 27 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 025efce540a7..a6b80cdc96f7 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -169,6 +169,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DRIVER_DETAILS] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DEV_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_PARENT_NAME] = { .type = NLA_NUL_STRING }, + [RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE] = { .type = NLA_U8 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -312,6 +313,10 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) dev_name(&device->parent->dev))) return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE, + device->name_assign_type)) + return -EMSGSIZE; + /* * Link type is determined on first port and mlx4 device * which can potentially have two different link type for the same diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 5de0d95d1f45..33513553379c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4198,9 +4198,10 @@ static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) { const char *name; - if (dev->sub_dev_name) + if (dev->sub_dev_name) { name = dev->sub_dev_name; - else if (!mlx5_lag_is_active(dev->mdev)) + ib_mark_name_assigned_by_user(&dev->ib_dev); + } else if (!mlx5_lag_is_active(dev->mdev)) name = "mlx5_%d"; else name = "mlx5_bond_%d"; diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index ca9a82e1c4c7..75d1407db52d 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -537,6 +537,7 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev) return -ENOMEM; rxe->ndev = ndev; + ib_mark_name_assigned_by_user(&rxe->ib_dev); err = rxe_add(rxe, ndev->mtu, ibdev_name); if (err) { diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index 61ad8ca3d1a2..b2b54242aa69 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -485,6 +485,7 @@ static int siw_newlink(const char *basedev_name, struct net_device *netdev) else sdev->state = IB_PORT_DOWN; + ib_mark_name_assigned_by_user(&sdev->base_dev); rv = siw_device_register(sdev, basedev_name); if (rv) ib_dealloc_device(&sdev->base_dev); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e09d4f09b602..6c5712ae559d 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2794,6 +2794,8 @@ struct ib_device { enum rdma_nl_dev_type type; struct ib_device *parent; struct list_head subdev_list; + + enum rdma_nl_name_assign_type name_assign_type; }; static inline void *rdma_zalloc_obj(struct ib_device *dev, size_t size, @@ -4867,4 +4869,10 @@ int ib_add_sub_device(struct ib_device *parent, * Return 0 on success, an error code otherwise */ int ib_del_sub_device_and_put(struct ib_device *sub); + +static inline void 
ib_mark_name_assigned_by_user(struct ib_device *ibdev) +{ + ibdev->name_assign_type = RDMA_NAME_ASSIGN_TYPE_USER; +} + #endif /* IB_VERBS_H */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 4b69242d7848..2f37568f5556 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -572,6 +572,8 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_PARENT_NAME, /* string */ + RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE, /* u8 */ + /* * Always the end */ @@ -615,4 +617,11 @@ enum rdma_nl_counter_mask { enum rdma_nl_dev_type { RDMA_DEVICE_TYPE_SMI = 1, }; + +/* RDMA device name assignment types */ +enum rdma_nl_name_assign_type { + RDMA_NAME_ASSIGN_TYPE_UNKNOWN = 0, + RDMA_NAME_ASSIGN_TYPE_USER = 1, /* Provided by user-space */ +}; + #endif /* _UAPI_RDMA_NETLINK_H */ From b851268018f0d63cb2069b5589bf4dbee1c74287 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Wed, 10 Jul 2024 17:16:57 +0800 Subject: [PATCH 53/65] RDMA/ocrdma: Don't inline statistics functions Fix the problem of KASAN causing the stack frame size to increase drivers/infiniband/hw/ocrdma/ocrdma_stats.c:686:16: error: stack frame size (20664) exceeds limit (8192) in 'ocrdma_dbgfs_ops_read' [-Werror,-Wframe-larger-than] static ssize_t ocrdma_dbgfs_ops_read(struct file *filp, char __user *buffer, ^ Some functions called by ocrdma_dbgfs_ops_read occupy a lot of stack space. Mark these functions as noinline_for_stack to prevent them from accumulating in ocrdma_dbgfs_ops_read. Signed-off-by: Peng Hao Link: https://lore.kernel.org/r/20240710091657.26291-1-flyingpeng@tencent.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/ocrdma/ocrdma_stats.c | 22 ++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c index 5f831e3bdbad..0834416cb3f8 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c @@ -46,7 +46,7 @@ static struct dentry *ocrdma_dbgfs_dir; -static int ocrdma_add_stat(char *start, char *pcur, +static noinline_for_stack int ocrdma_add_stat(char *start, char *pcur, char *name, u64 count) { char buff[128] = {0}; @@ -99,7 +99,7 @@ void ocrdma_release_stats_resources(struct ocrdma_dev *dev) kfree(mem->debugfs_mem); } -static char *ocrdma_resource_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_resource_stats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -216,7 +216,7 @@ static char *ocrdma_resource_stats(struct ocrdma_dev *dev) return stats; } -static char *ocrdma_rx_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_rx_stats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -284,7 +284,7 @@ static u64 ocrdma_sysfs_rcv_data(struct ocrdma_dev *dev) rx_stats->roce_frame_bytes_hi))/4; } -static char *ocrdma_tx_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_tx_stats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -358,7 +358,7 @@ static u64 ocrdma_sysfs_xmit_data(struct ocrdma_dev *dev) tx_stats->read_rsp_bytes_hi))/4; } -static char *ocrdma_wqe_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_wqe_stats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = 
@@ -391,7 +391,7 @@ static char *ocrdma_wqe_stats(struct ocrdma_dev *dev) return stats; } -static char *ocrdma_db_errstats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_db_errstats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -412,7 +412,7 @@ static char *ocrdma_db_errstats(struct ocrdma_dev *dev) return stats; } -static char *ocrdma_rxqp_errstats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_rxqp_errstats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -438,7 +438,7 @@ static char *ocrdma_rxqp_errstats(struct ocrdma_dev *dev) return stats; } -static char *ocrdma_txqp_errstats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_txqp_errstats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -462,7 +462,7 @@ static char *ocrdma_txqp_errstats(struct ocrdma_dev *dev) return stats; } -static char *ocrdma_tx_dbg_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_tx_dbg_stats(struct ocrdma_dev *dev) { int i; char *pstats = dev->stats_mem.debugfs_mem; @@ -480,7 +480,7 @@ static char *ocrdma_tx_dbg_stats(struct ocrdma_dev *dev) return dev->stats_mem.debugfs_mem; } -static char *ocrdma_rx_dbg_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_rx_dbg_stats(struct ocrdma_dev *dev) { int i; char *pstats = dev->stats_mem.debugfs_mem; @@ -498,7 +498,7 @@ static char *ocrdma_rx_dbg_stats(struct ocrdma_dev *dev) return dev->stats_mem.debugfs_mem; } -static char *ocrdma_driver_dbg_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_driver_dbg_stats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; From 6afa2c0bfb8ef69f65715ae059e5bd5f9bbaf03b Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Wed, 10 Jul 2024 21:36:58 +0800 Subject: [PATCH 54/65] RDMA/hns: Check atomic wr length 8 bytes is the only supported length of atomic. Add this check in set_rc_wqe(). Besides, stop processing WQEs and return from set_rc_wqe() if there is any error. Fixes: 384f88185112 ("RDMA/hns: Add atomic support") Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240710133705.896445-2-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_device.h | 2 ++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 9 +++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index ef50cd03f489..bf7f966c89fc 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -91,6 +91,8 @@ /* Configure to HW for PAGE_SIZE larger than 4KB */ #define PG_SHIFT_OFFSET (PAGE_SHIFT - 12) +#define ATOMIC_WR_LEN 8 + #define HNS_ROCE_IDX_QUE_ENTRY_SZ 4 #define SRQ_DB_REG 0x230 diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 4287818a737f..eb6052ee8938 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -591,11 +591,16 @@ static inline int set_rc_wqe(struct hns_roce_qp *qp, (wr->send_flags & IB_SEND_SIGNALED) ? 
1 : 0); if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || - wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) + wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + if (msg_len != ATOMIC_WR_LEN) + return -EINVAL; set_atomic_seg(wr, rc_sq_wqe, valid_num_sge); - else if (wr->opcode != IB_WR_REG_MR) + } else if (wr->opcode != IB_WR_REG_MR) { ret = set_rwqe_data_seg(&qp->ibqp, wr, rc_sq_wqe, &curr_idx, valid_num_sge); + if (ret) + return ret; + } /* * The pipeline can sequentially post all valid WQEs into WQ buffer, From 2fdf34038369c0a27811e7b4680662a14ada1d6b Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Wed, 10 Jul 2024 21:36:59 +0800 Subject: [PATCH 55/65] RDMA/hns: Fix soft lockup under heavy CEQE load CEQEs are handled in interrupt handler currently. This may cause the CPU core staying in interrupt context too long and lead to soft lockup under heavy load. Handle CEQEs in BH workqueue and set an upper limit for the number of CEQE handled by a single call of work handler. Fixes: a5073d6054f7 ("RDMA/hns: Add eq support of hip08") Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240710133705.896445-3-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 89 ++++++++++++--------- 2 files changed, 54 insertions(+), 36 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index bf7f966c89fc..3cc5a4ff404a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -717,6 +717,7 @@ struct hns_roce_eq { int shift; int event_type; int sub_type; + struct work_struct work; }; struct hns_roce_eq_table { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index eb6052ee8938..2f16554c96be 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -6140,33 +6141,11 @@ static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) !!(eq->cons_index & eq->entries)) ? 
ceqe : NULL; } -static irqreturn_t hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) +static irqreturn_t hns_roce_v2_ceq_int(struct hns_roce_eq *eq) { - struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq); - irqreturn_t ceqe_found = IRQ_NONE; - u32 cqn; + queue_work(system_bh_wq, &eq->work); - while (ceqe) { - /* Make sure we read CEQ entry after we have checked the - * ownership bit - */ - dma_rmb(); - - cqn = hr_reg_read(ceqe, CEQE_CQN); - - hns_roce_cq_completion(hr_dev, cqn); - - ++eq->cons_index; - ceqe_found = IRQ_HANDLED; - atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CEQE_CNT]); - - ceqe = next_ceqe_sw_v2(eq); - } - - update_eq_db(eq); - - return IRQ_RETVAL(ceqe_found); + return IRQ_HANDLED; } static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) @@ -6177,7 +6156,7 @@ static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) if (eq->type_flag == HNS_ROCE_CEQ) /* Completion event interrupt */ - int_work = hns_roce_v2_ceq_int(hr_dev, eq); + int_work = hns_roce_v2_ceq_int(eq); else /* Asynchronous event interrupt */ int_work = hns_roce_v2_aeq_int(hr_dev, eq); @@ -6545,6 +6524,34 @@ free_cmd_mbox: return ret; } +static void hns_roce_ceq_work(struct work_struct *work) +{ + struct hns_roce_eq *eq = from_work(eq, work, work); + struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq); + struct hns_roce_dev *hr_dev = eq->hr_dev; + int ceqe_num = 0; + u32 cqn; + + while (ceqe && ceqe_num < hr_dev->caps.ceqe_depth) { + /* Make sure we read CEQ entry after we have checked the + * ownership bit + */ + dma_rmb(); + + cqn = hr_reg_read(ceqe, CEQE_CQN); + + hns_roce_cq_completion(hr_dev, cqn); + + ++eq->cons_index; + ++ceqe_num; + atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CEQE_CNT]); + + ceqe = next_ceqe_sw_v2(eq); + } + + update_eq_db(eq); +} + static int __hns_roce_request_irq(struct hns_roce_dev *hr_dev, int irq_num, int comp_num, int aeq_num, int other_num) { @@ -6576,21 +6583,24 @@ static int __hns_roce_request_irq(struct hns_roce_dev *hr_dev, int irq_num, j - other_num - aeq_num); for (j = 0; j < irq_num; j++) { - if (j < other_num) + if (j < other_num) { ret = request_irq(hr_dev->irq[j], hns_roce_v2_msix_interrupt_abn, 0, hr_dev->irq_names[j], hr_dev); - - else if (j < (other_num + comp_num)) + } else if (j < (other_num + comp_num)) { + INIT_WORK(&eq_table->eq[j - other_num].work, + hns_roce_ceq_work); ret = request_irq(eq_table->eq[j - other_num].irq, hns_roce_v2_msix_interrupt_eq, 0, hr_dev->irq_names[j + aeq_num], &eq_table->eq[j - other_num]); - else + } else { ret = request_irq(eq_table->eq[j - other_num].irq, hns_roce_v2_msix_interrupt_eq, 0, hr_dev->irq_names[j - comp_num], &eq_table->eq[j - other_num]); + } + if (ret) { dev_err(hr_dev->dev, "request irq error!\n"); goto err_request_failed; @@ -6600,12 +6610,16 @@ static int __hns_roce_request_irq(struct hns_roce_dev *hr_dev, int irq_num, return 0; err_request_failed: - for (j -= 1; j >= 0; j--) - if (j < other_num) + for (j -= 1; j >= 0; j--) { + if (j < other_num) { free_irq(hr_dev->irq[j], hr_dev); - else - free_irq(eq_table->eq[j - other_num].irq, - &eq_table->eq[j - other_num]); + continue; + } + free_irq(eq_table->eq[j - other_num].irq, + &eq_table->eq[j - other_num]); + if (j < other_num + comp_num) + cancel_work_sync(&eq_table->eq[j - other_num].work); + } err_kzalloc_failed: for (i -= 1; i >= 0; i--) @@ -6626,8 +6640,11 @@ static void __hns_roce_free_irq(struct hns_roce_dev *hr_dev) for (i = 0; i < hr_dev->caps.num_other_vectors; i++) free_irq(hr_dev->irq[i], hr_dev); - for (i = 
0; i < eq_num; i++) { free_irq(hr_dev->eq_table.eq[i].irq, &hr_dev->eq_table.eq[i]); if (i < hr_dev->caps.num_comp_vectors) cancel_work_sync(&hr_dev->eq_table.eq[i].work); } for (i = 0; i < irq_num; i++) kfree(hr_dev->irq_names[i]); From 543fb987bd63ed27409b5dea3d3eec27b9c1eac9 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Wed, 10 Jul 2024 21:37:00 +0800 Subject: [PATCH 56/65] RDMA/hns: Fix mismatched exception handling when eq table init fails The hw ctx should be destroyed when eq table init fails. Fixes: a5073d6054f7 ("RDMA/hns: Add eq support of hip08") Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240710133705.896445-4-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 25 +++++++++++----------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 2f16554c96be..cbbc142afc1b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -6368,9 +6368,16 @@ static void hns_roce_v2_int_mask_enable(struct hns_roce_dev *hr_dev, roce_write(hr_dev, ROCEE_VF_ABN_INT_CFG_REG, enable_flag); } -static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, u32 eqn) +static void free_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) +{ + hns_roce_mtr_destroy(hr_dev, &eq->mtr); +} + +static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) { struct device *dev = hr_dev->dev; + int eqn = eq->eqn; int ret; u8 cmd; @@ -6381,12 +6388,9 @@ static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, u32 eqn) ret = hns_roce_destroy_hw_ctx(hr_dev, cmd, eqn & HNS_ROCE_V2_EQN_M); if (ret) - dev_err(dev, "[mailbox cmd] destroy eqc(%u) failed.\n", eqn); -} + dev_err(dev, "[mailbox cmd] destroy eqc(%d) failed.\n", eqn); -static void free_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) -{ - hns_roce_mtr_destroy(hr_dev, &eq->mtr); + free_eq_buf(hr_dev, eq); } static void init_eq_config(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) @@ -6733,7 +6737,7 @@ err_request_irq_fail: err_create_eq_fail: for (i -= 1; i >= 0; i--) - free_eq_buf(hr_dev, &eq_table->eq[i]); + hns_roce_v2_destroy_eqc(hr_dev, &eq_table->eq[i]); kfree(eq_table->eq); return ret; @@ -6753,11 +6757,8 @@ static void hns_roce_v2_cleanup_eq_table(struct hns_roce_dev *hr_dev) __hns_roce_free_irq(hr_dev); destroy_workqueue(hr_dev->irq_workq); - for (i = 0; i < eq_num; i++) { - hns_roce_v2_destroy_eqc(hr_dev, i); - - free_eq_buf(hr_dev, &eq_table->eq[i]); - } + for (i = 0; i < eq_num; i++) + hns_roce_v2_destroy_eqc(hr_dev, &eq_table->eq[i]); kfree(eq_table->eq); } From d387d4b54eb84208bd4ca13572e106851d0a0819 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Wed, 10 Jul 2024 21:37:01 +0800 Subject: [PATCH 57/65] RDMA/hns: Fix missing page size and alignment check in FRMR The offset requires 128B alignment and the page size ranges from 4K to 128M.
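Concretely, the fix rejects a mapping up front when the request violates those HW limits; a condensed, annotated sketch of the check added to hns_roce_map_mr_sg() in the hunks below (constants as introduced there):

/* Sketch: FRMR limits enforced before any page-list work is done. */
if (!IS_ALIGNED(*sg_offset, HNS_ROCE_FRMR_ALIGN_SIZE) || /* 128 B   */
    ibmr->page_size < HNS_HW_PAGE_SIZE ||                /* >= 4 KB  */
    ibmr->page_size > HNS_HW_MAX_PAGE_SIZE)              /* <= 128 MB */
        return sg_num; /* still 0: no SGEs mapped, caller sees failure */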
Fixes: 68a997c5d28c ("RDMA/hns: Add FRMR support for hip08") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240710133705.896445-5-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_device.h | 4 ++++ drivers/infiniband/hw/hns/hns_roce_mr.c | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 3cc5a4ff404a..0b1e21cb6d2d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -83,6 +83,7 @@ #define MR_TYPE_DMA 0x03 #define HNS_ROCE_FRMR_MAX_PA 512 +#define HNS_ROCE_FRMR_ALIGN_SIZE 128 #define PKEY_ID 0xffff #define NODE_DESC_SIZE 64 @@ -189,6 +190,9 @@ enum { #define HNS_HW_PAGE_SHIFT 12 #define HNS_HW_PAGE_SIZE (1 << HNS_HW_PAGE_SHIFT) +#define HNS_HW_MAX_PAGE_SHIFT 27 +#define HNS_HW_MAX_PAGE_SIZE (1 << HNS_HW_MAX_PAGE_SHIFT) + struct hns_roce_uar { u64 pfn; unsigned long index; diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 1a61dceb3319..846da8c78b8b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -443,6 +443,11 @@ int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, struct hns_roce_mtr *mtr = &mr->pbl_mtr; int ret, sg_num = 0; + if (!IS_ALIGNED(*sg_offset, HNS_ROCE_FRMR_ALIGN_SIZE) || + ibmr->page_size < HNS_HW_PAGE_SIZE || + ibmr->page_size > HNS_HW_MAX_PAGE_SIZE) + return sg_num; + mr->npages = 0; mr->page_list = kvcalloc(mr->pbl_mtr.hem_cfg.buf_pg_count, sizeof(dma_addr_t), GFP_KERNEL); From 24c6291346d98c7ece4f4bfeb5733bec1d6c7b4f Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Wed, 10 Jul 2024 21:37:02 +0800 Subject: [PATCH 58/65] RDMA/hns: Fix shift-out-of-bounds when max_inline_data is 0 A shift-out-of-bounds may occur if max_inline_data has not been set.
The related log: UBSAN: shift-out-of-bounds in kernel/include/linux/log2.h:57:13 shift exponent 64 is too large for 64-bit type 'long unsigned int' Call trace: dump_backtrace+0xb0/0x118 show_stack+0x20/0x38 dump_stack_lvl+0xbc/0x120 dump_stack+0x1c/0x28 __ubsan_handle_shift_out_of_bounds+0x104/0x240 set_ext_sge_param+0x40c/0x420 [hns_roce_hw_v2] hns_roce_create_qp+0xf48/0x1c40 [hns_roce_hw_v2] create_qp.part.0+0x294/0x3c0 ib_create_qp_kernel+0x7c/0x150 create_mad_qp+0x11c/0x1e0 ib_mad_init_device+0x834/0xc88 add_client_context+0x248/0x318 enable_device_and_get+0x158/0x280 ib_register_device+0x4ac/0x610 hns_roce_init+0x890/0xf98 [hns_roce_hw_v2] __hns_roce_hw_v2_init_instance+0x398/0x720 [hns_roce_hw_v2] hns_roce_hw_v2_init_instance+0x108/0x1e0 [hns_roce_hw_v2] hclge_init_roce_client_instance+0x1a0/0x358 [hclge] hclge_init_client_instance+0xa0/0x508 [hclge] hnae3_register_client+0x18c/0x210 [hnae3] hns_roce_hw_v2_init+0x28/0xff8 [hns_roce_hw_v2] do_one_initcall+0xe0/0x510 do_init_module+0x110/0x370 load_module+0x2c6c/0x2f20 init_module_from_file+0xe0/0x140 idempotent_init_module+0x24c/0x350 __arm64_sys_finit_module+0x88/0xf8 invoke_syscall+0x68/0x1a0 el0_svc_common.constprop.0+0x11c/0x150 do_el0_svc+0x38/0x50 el0_svc+0x50/0xa0 el0t_64_sync_handler+0xc0/0xc8 el0t_64_sync+0x1a4/0x1a8 Fixes: 0c5e259b06a8 ("RDMA/hns: Fix incorrect sge nums calculation") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240710133705.896445-6-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_qp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index db34665d1dfb..1de384ce4d0e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -532,13 +532,15 @@ static unsigned int get_sge_num_from_max_inl_data(bool is_ud_or_gsi, { unsigned int inline_sge; - inline_sge = roundup_pow_of_two(max_inline_data) / HNS_ROCE_SGE_SIZE; + if (!max_inline_data) + return 0; /* * if max_inline_data less than * HNS_ROCE_SGE_IN_WQE * HNS_ROCE_SGE_SIZE, * In addition to ud's mode, no need to extend sge. */ + inline_sge = roundup_pow_of_two(max_inline_data) / HNS_ROCE_SGE_SIZE; if (!is_ud_or_gsi && inline_sge <= HNS_ROCE_SGE_IN_WQE) inline_sge = 0; From 36397b907355e2fdb5a25a02a7921a937fd8ef4c Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Wed, 10 Jul 2024 21:37:03 +0800 Subject: [PATCH 59/65] RDMA/hns: Fix undefined behavior caused by invalid max_sge If max_sge has been set to 0, roundup_pow_of_two() in set_srq_basic_param() may have undefined behavior.
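The undefined behavior is inherent to roundup_pow_of_two() for a zero argument: the common implementation in include/linux/log2.h is 1UL << fls_long(n - 1), so n == 0 wraps to ULONG_MAX and the shift count becomes the full word width, matching the UBSAN report quoted for the previous patch. A minimal sketch of the failure mode:

unsigned int max_sge = 0;
/* (n - 1) wraps to ULONG_MAX; fls_long() then returns BITS_PER_LONG,
 * and 1UL << 64 is undefined on a 64-bit build -- hence the extra
 * !attr->max_sge rejection in the hunk below.
 */
unsigned long depth = 1UL << fls_long((unsigned long)max_sge - 1);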
Fixes: 9dd052474a26 ("RDMA/hns: Allocate one more recv SGE for HIP08") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240710133705.896445-7-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_srq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index f1997abc97ca..c9b8233f4b05 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -297,7 +297,7 @@ static int set_srq_basic_param(struct hns_roce_srq *srq, max_sge = proc_srq_sge(hr_dev, srq, !!udata); if (attr->max_wr > hr_dev->caps.max_srq_wrs || - attr->max_sge > max_sge) { + attr->max_sge > max_sge || !attr->max_sge) { ibdev_err(&hr_dev->ib_dev, "invalid SRQ attr, depth = %u, sge = %u.\n", attr->max_wr, attr->max_sge); From 0b8e658f70ffd5dc7cda3872fd524d657d4796b7 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Wed, 10 Jul 2024 21:37:04 +0800 Subject: [PATCH 60/65] RDMA/hns: Fix insufficient extend DB for VFs VFs and their PF share the memory of the extend DB. Currently, the number of extend DBs allocated by the driver is only enough for the PF. This can lead to DB loss and other problems in scenarios where both the PF and VFs use a large number of QPs. Fixes: 6b63597d3540 ("RDMA/hns: Add TSQ link table support") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240710133705.896445-8-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index cbbc142afc1b..aecd137c1e60 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2463,14 +2463,16 @@ static int set_llm_cfg_to_hw(struct hns_roce_dev *hr_dev, static struct hns_roce_link_table * alloc_link_table_buf(struct hns_roce_dev *hr_dev) { + u16 total_sl = hr_dev->caps.sl_num * hr_dev->func_num; struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_link_table *link_tbl; u32 pg_shift, size, min_size; link_tbl = &priv->ext_llm; pg_shift = hr_dev->caps.llm_buf_pg_sz + PAGE_SHIFT; - size = hr_dev->caps.num_qps * HNS_ROCE_V2_EXT_LLM_ENTRY_SZ; - min_size = HNS_ROCE_EXT_LLM_MIN_PAGES(hr_dev->caps.sl_num) << pg_shift; + size = hr_dev->caps.num_qps * hr_dev->func_num * + HNS_ROCE_V2_EXT_LLM_ENTRY_SZ; + min_size = HNS_ROCE_EXT_LLM_MIN_PAGES(total_sl) << pg_shift; /* Alloc data table */ size = max(size, min_size); From bbddfa2255dd0800209697fd12378e02ed05f833 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Wed, 10 Jul 2024 21:37:05 +0800 Subject: [PATCH 61/65] RDMA/hns: Fix mbx timing out before CMD execution is completed When a large number of tasks are issued, HW processes mbx commands more slowly. The current firmware judges an mbx timeout at 30ms, and the driver's timeout is also 30ms. Since firmware scheduling in multi-function scenarios takes a certain amount of time, the driver can time out and report a failure before the mbx execution itself times out. This patch introduces a new mechanism that can set different timeouts for different cmds and extends the mbx timeout to 35ms.
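The mechanism is deliberately small: a constant opcode-to-timeout table consulted per command, falling back to the queue-wide default when an opcode has no entry. Extending it to another slow command would just be one more row; the commented-out second entry below is hypothetical, for illustration only:

/* Sketch: per-opcode CMDQ timeout table as introduced below. */
static const struct hns_roce_cmdq_tx_timeout_map cmdq_tx_timeout[] = {
        {HNS_ROCE_OPC_POST_MB, HNS_ROCE_OPC_POST_MB_TIMEOUT}, /* 35 ms */
        /* {HNS_ROCE_OPC_SOME_SLOW_CMD, 50000}, hypothetical addition */
};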
Fixes: a04ff739f2a9 ("RDMA/hns: Add command queue support for hip08 RoCE driver") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240710133705.896445-9-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 35 +++++++++++++++++----- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 6 ++++ 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index aecd137c1e60..621b057fb9da 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1275,12 +1275,38 @@ static int hns_roce_cmd_err_convert_errno(u16 desc_ret) return -EIO; } +static u32 hns_roce_cmdq_tx_timeout(u16 opcode, u32 tx_timeout) +{ + static const struct hns_roce_cmdq_tx_timeout_map cmdq_tx_timeout[] = { + {HNS_ROCE_OPC_POST_MB, HNS_ROCE_OPC_POST_MB_TIMEOUT}, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(cmdq_tx_timeout); i++) + if (cmdq_tx_timeout[i].opcode == opcode) + return cmdq_tx_timeout[i].tx_timeout; + + return tx_timeout; +} + +static void hns_roce_wait_csq_done(struct hns_roce_dev *hr_dev, u16 opcode) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + u32 tx_timeout = hns_roce_cmdq_tx_timeout(opcode, priv->cmq.tx_timeout); + u32 timeout = 0; + + do { + if (hns_roce_cmq_csq_done(hr_dev)) + break; + udelay(1); + } while (++timeout < tx_timeout); +} + static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, struct hns_roce_cmq_desc *desc, int num) { struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq; - u32 timeout = 0; u16 desc_ret; u32 tail; int ret; @@ -1301,12 +1327,7 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CMDS_CNT]); - do { - if (hns_roce_cmq_csq_done(hr_dev)) - break; - udelay(1); - } while (++timeout < priv->cmq.tx_timeout); - + hns_roce_wait_csq_done(hr_dev, le16_to_cpu(desc->opcode)); if (hns_roce_cmq_csq_done(hr_dev)) { ret = 0; for (i = 0; i < num; i++) { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index def1d15a03c7..c65f68a14a26 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -224,6 +224,12 @@ enum hns_roce_opcode_type { HNS_SWITCH_PARAMETER_CFG = 0x1033, }; +#define HNS_ROCE_OPC_POST_MB_TIMEOUT 35000 +struct hns_roce_cmdq_tx_timeout_map { + u16 opcode; + u32 tx_timeout; +}; + enum { TYPE_CRQ, TYPE_CSQ, From 2043a14fb3de9d88440b21590f714306fcbbd55f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 10 Jul 2024 13:33:10 -0700 Subject: [PATCH 62/65] RDMA: Fix netdev tracker in ib_device_set_netdev If a netdev has already been assigned, ib_device_set_netdev needs to release the reference on the older netdev but it is mistakenly being called for the new netdev. Fix it and in the process use netdev_put to be symmetrical with the netdev_hold. 
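The underlying rule is that a netdevice_tracker must be released against the same netdev it was charged to, and netdev_hold()/netdev_put() (both NULL-safe) make that pairing explicit. A minimal sketch of the corrected swap pattern, assuming the caller holds pdata->netdev_lock:

/* Sketch: replace a tracked netdev reference (pattern only). */
rcu_assign_pointer(pdata->netdev, new_ndev);
netdev_put(old_ndev, &pdata->netdev_tracker);   /* release ref on old dev */
netdev_hold(new_ndev, &pdata->netdev_tracker, GFP_ATOMIC); /* charge new */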
Fixes: 09f530f0c6d6 ("RDMA: Add netdevice_tracker to ib_device_set_netdev()") Signed-off-by: David Ahern Link: https://lore.kernel.org/r/20240710203310.19317-1-dsahern@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 7b418c717f29..0290aca18d26 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2184,16 +2184,12 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, return 0; } - if (old_ndev) - netdev_tracker_free(ndev, &pdata->netdev_tracker); - if (ndev) - netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); rcu_assign_pointer(pdata->netdev, ndev); + netdev_put(old_ndev, &pdata->netdev_tracker); + netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); spin_unlock_irqrestore(&pdata->netdev_lock, flags); add_ndev_hash(pdata); - __dev_put(old_ndev); - return 0; } EXPORT_SYMBOL(ib_device_set_netdev); From 95b087f87b780daafad1dbb2c84e81b729d5d33f Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Wed, 10 Jul 2024 14:21:02 +0200 Subject: [PATCH 63/65] bnxt_re: Fix imm_data endianness When mapping a device between servers with MLX and BCM RoCE NICs, the RTRS server complains about an unknown imm type and can't map the device. After more debugging, it turned out that bnxt_re mishandles the imm_data; this patch fixes the compatibility issue with MLX for us. In an off-list discussion, Selvin confirmed that the HW works in little-endian format and all data needs to be converted to LE when provided. This patch fixes the endianness of imm_data. Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Signed-off-by: Jack Wang Link: https://lore.kernel.org/r/20240710122102.37569-1-jinpu.wang@ionos.com Acked-by: Selvin Xavier Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 8 ++++---- drivers/infiniband/hw/bnxt_re/qplib_fp.h | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index e453ca701e87..7c757351a016 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2479,7 +2479,7 @@ static int bnxt_re_build_send_wqe(struct bnxt_re_qp *qp, break; case IB_WR_SEND_WITH_IMM: wqe->type = BNXT_QPLIB_SWQE_TYPE_SEND_WITH_IMM; - wqe->send.imm_data = wr->ex.imm_data; + wqe->send.imm_data = be32_to_cpu(wr->ex.imm_data); break; case IB_WR_SEND_WITH_INV: wqe->type = BNXT_QPLIB_SWQE_TYPE_SEND_WITH_INV; @@ -2509,7 +2509,7 @@ static int bnxt_re_build_rdma_wqe(const struct ib_send_wr *wr, break; case IB_WR_RDMA_WRITE_WITH_IMM: wqe->type = BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE_WITH_IMM; - wqe->rdma.imm_data = wr->ex.imm_data; + wqe->rdma.imm_data = be32_to_cpu(wr->ex.imm_data); break; case IB_WR_RDMA_READ: wqe->type = BNXT_QPLIB_SWQE_TYPE_RDMA_READ; @@ -3582,7 +3582,7 @@ static void bnxt_re_process_res_shadow_qp_wc(struct bnxt_re_qp *gsi_sqp, wc->byte_len = orig_cqe->length; wc->qp = &gsi_qp->ib_qp; - wc->ex.imm_data = orig_cqe->immdata; + wc->ex.imm_data = cpu_to_be32(le32_to_cpu(orig_cqe->immdata)); wc->src_qp = orig_cqe->src_qp; memcpy(wc->smac, orig_cqe->smac, ETH_ALEN); if (bnxt_re_is_vlan_pkt(orig_cqe, &vlan_id, &sl)) { @@ -3727,7 +3727,7 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc) (unsigned long)(cqe->qp_handle), struct bnxt_re_qp, qplib_qp); wc->qp = &qp->ib_qp; - wc->ex.imm_data = cqe->immdata; +
wc->ex.imm_data = cpu_to_be32(le32_to_cpu(cqe->immdata)); wc->src_qp = cqe->src_qp; memcpy(wc->smac, cqe->smac, ETH_ALEN); wc->port_num = 1; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index 4aaac84c1b1b..56538b90d6c5 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -164,7 +164,7 @@ struct bnxt_qplib_swqe { /* Send, with imm, inval key */ struct { union { - __be32 imm_data; + u32 imm_data; u32 inv_key; }; u32 q_key; @@ -182,7 +182,7 @@ struct bnxt_qplib_swqe { /* RDMA write, with imm, read */ struct { union { - __be32 imm_data; + u32 imm_data; u32 inv_key; }; u64 remote_va; @@ -389,7 +389,7 @@ struct bnxt_qplib_cqe { u16 cfa_meta; u64 wr_id; union { - __be32 immdata; + __le32 immdata; u32 invrkey; }; u64 qp_handle; From 1df03a4b44146c4f720d793915747272c7773a3e Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Thu, 11 Jul 2024 06:37:57 -0700 Subject: [PATCH 64/65] RDMA/mana_ib: Set correct device into ib Add mana_get_primary_netdev_rcu helper to get a primary netdevice for a given port. When mana is used with netvsc, the VF netdev is controlled by an upper netvsc device. In a baremetal case, the VF netdev is the primary device. Use the mana_get_primary_netdev_rcu() helper in the mana_ib to get the correct device for querying network states. Fixes: 8b184e4f1c32 ("RDMA/mana_ib: Enable RoCE on port 1") Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1720705077-322-1-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/device.c | 16 ++++++++-------- drivers/net/ethernet/microsoft/mana/mana_en.c | 19 +++++++++++++++++++ include/net/mana/mana.h | 2 ++ 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index b07a8e2e838f..7ac01918ef7c 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -56,7 +56,7 @@ static int mana_ib_probe(struct auxiliary_device *adev, { struct mana_adev *madev = container_of(adev, struct mana_adev, adev); struct gdma_dev *mdev = madev->mdev; - struct net_device *upper_ndev; + struct net_device *ndev; struct mana_context *mc; struct mana_ib_dev *dev; u8 mac_addr[ETH_ALEN]; @@ -84,17 +84,17 @@ static int mana_ib_probe(struct auxiliary_device *adev, dev->ib_dev.num_comp_vectors = mdev->gdma_context->max_num_queues; dev->ib_dev.dev.parent = mdev->gdma_context->dev; - rcu_read_lock(); /* required to get upper dev */ - upper_ndev = netdev_master_upper_dev_get_rcu(mc->ports[0]); - if (!upper_ndev) { + rcu_read_lock(); /* required to get primary netdev */ + ndev = mana_get_primary_netdev_rcu(mc, 0); + if (!ndev) { rcu_read_unlock(); ret = -ENODEV; - ibdev_err(&dev->ib_dev, "Failed to get master netdev"); + ibdev_err(&dev->ib_dev, "Failed to get netdev for IB port 1"); goto free_ib_device; } - ether_addr_copy(mac_addr, upper_ndev->dev_addr); - addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, upper_ndev->dev_addr); - ret = ib_device_set_netdev(&dev->ib_dev, upper_ndev, 1); + ether_addr_copy(mac_addr, ndev->dev_addr); + addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, ndev->dev_addr); + ret = ib_device_set_netdev(&dev->ib_dev, ndev, 1); rcu_read_unlock(); if (ret) { ibdev_err(&dev->ib_dev, "Failed to set ib netdev, ret %d", ret); diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c 
From 1df03a4b44146c4f720d793915747272c7773a3e Mon Sep 17 00:00:00 2001
From: Konstantin Taranov
Date: Thu, 11 Jul 2024 06:37:57 -0700
Subject: [PATCH 64/65] RDMA/mana_ib: Set correct device into ib

Add a mana_get_primary_netdev_rcu() helper to get the primary netdevice
for a given port. When mana is used with netvsc, the VF netdev is
controlled by an upper netvsc device. In the bare-metal case, the VF
netdev itself is the primary device. Use the mana_get_primary_netdev_rcu()
helper in mana_ib to get the correct device for querying network states.

Fixes: 8b184e4f1c32 ("RDMA/mana_ib: Enable RoCE on port 1")
Signed-off-by: Konstantin Taranov
Link: https://lore.kernel.org/r/1720705077-322-1-git-send-email-kotaranov@linux.microsoft.com
Reviewed-by: Long Li
Reviewed-by: Zhu Yanjun
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/hw/mana/device.c           | 16 ++++++++--------
 drivers/net/ethernet/microsoft/mana/mana_en.c | 19 +++++++++++++++++++
 include/net/mana/mana.h                       |  2 ++
 3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c
index b07a8e2e838f..7ac01918ef7c 100644
--- a/drivers/infiniband/hw/mana/device.c
+++ b/drivers/infiniband/hw/mana/device.c
@@ -56,7 +56,7 @@ static int mana_ib_probe(struct auxiliary_device *adev,
 {
 	struct mana_adev *madev = container_of(adev, struct mana_adev, adev);
 	struct gdma_dev *mdev = madev->mdev;
-	struct net_device *upper_ndev;
+	struct net_device *ndev;
 	struct mana_context *mc;
 	struct mana_ib_dev *dev;
 	u8 mac_addr[ETH_ALEN];
@@ -84,17 +84,17 @@ static int mana_ib_probe(struct auxiliary_device *adev,
 	dev->ib_dev.num_comp_vectors = mdev->gdma_context->max_num_queues;
 	dev->ib_dev.dev.parent = mdev->gdma_context->dev;

-	rcu_read_lock(); /* required to get upper dev */
-	upper_ndev = netdev_master_upper_dev_get_rcu(mc->ports[0]);
-	if (!upper_ndev) {
+	rcu_read_lock(); /* required to get primary netdev */
+	ndev = mana_get_primary_netdev_rcu(mc, 0);
+	if (!ndev) {
 		rcu_read_unlock();
 		ret = -ENODEV;
-		ibdev_err(&dev->ib_dev, "Failed to get master netdev");
+		ibdev_err(&dev->ib_dev, "Failed to get netdev for IB port 1");
 		goto free_ib_device;
 	}
-	ether_addr_copy(mac_addr, upper_ndev->dev_addr);
-	addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, upper_ndev->dev_addr);
-	ret = ib_device_set_netdev(&dev->ib_dev, upper_ndev, 1);
+	ether_addr_copy(mac_addr, ndev->dev_addr);
+	addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, ndev->dev_addr);
+	ret = ib_device_set_netdev(&dev->ib_dev, ndev, 1);
 	rcu_read_unlock();
 	if (ret) {
 		ibdev_err(&dev->ib_dev, "Failed to set ib netdev, ret %d", ret);
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index b89ad4afd66e..68c2bea2c022 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -3007,3 +3007,22 @@ out:
 	gd->gdma_context = NULL;
 	kfree(ac);
 }
+
+struct net_device *mana_get_primary_netdev_rcu(struct mana_context *ac, u32 port_index)
+{
+	struct net_device *ndev;
+
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
+			 "Taking primary netdev without holding the RCU read lock");
+	if (port_index >= ac->num_ports)
+		return NULL;
+
+	/* When mana is used in netvsc, the upper netdevice should be returned. */
+	if (ac->ports[port_index]->flags & IFF_SLAVE)
+		ndev = netdev_master_upper_dev_get_rcu(ac->ports[port_index]);
+	else
+		ndev = ac->ports[port_index];
+
+	return ndev;
+}
+EXPORT_SYMBOL_NS(mana_get_primary_netdev_rcu, NET_MANA);
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 59823901b74f..f9b4b0dcb69f 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -797,4 +797,6 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type,
 int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
 		   u32 doorbell_pg_id);
 void mana_uncfg_vport(struct mana_port_context *apc);
+
+struct net_device *mana_get_primary_netdev_rcu(struct mana_context *ac, u32 port_index);
 #endif /* _MANA_H */
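On the intended calling convention (a sketch under assumptions, not from
the patch): the helper is only valid under the RCU read lock, so a
consumer that wants to use the netdev after unlocking must take its own
reference first. mana_example_get_ndev() is a hypothetical name:

/* Illustrative consumer: resolve the primary netdev and pin it so it
 * can be used outside the RCU read-side critical section.
 */
static struct net_device *mana_example_get_ndev(struct mana_context *mc,
						u32 port)
{
	struct net_device *ndev;

	rcu_read_lock();
	ndev = mana_get_primary_netdev_rcu(mc, port);
	if (ndev)
		dev_hold(ndev); /* keep it alive past rcu_read_unlock() */
	rcu_read_unlock();

	return ndev; /* caller must dev_put() when done */
}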
From 887cd308fd46a1c6956e9ccda1aaca830edc8ed7 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET
Date: Sun, 14 Jul 2024 14:15:25 +0200
Subject: [PATCH 65/65] IB/hfi1: Constify struct flag_table

'struct flag_table' instances are not modified in this driver.
Constifying these structures moves some data to a read-only section and
so increases overall security.

On x86_64, with allmodconfig:

Before:
======
   text    data     bss     dec     hex filename
 302932   40271     112  343315   53d13 drivers/infiniband/hw/hfi1/chip.o

After:
=====
   text    data     bss     dec     hex filename
 311636   31567     112  343315   53d13 drivers/infiniband/hw/hfi1/chip.o

Link: https://lore.kernel.org/r/782b6a648bfbbf2bb83f81a73c0460b5bb7642a1.1720959310.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Christophe JAILLET
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hfi1/chip.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 78f27f7b4203..c52e6b2c9914 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -251,7 +251,7 @@ struct flag_table {
 /*
  * CCE Error flags.
  */
-static struct flag_table cce_err_status_flags[] = {
+static const struct flag_table cce_err_status_flags[] = {
 /* 0*/	FLAG_ENTRY0("CceCsrParityErr",
 		CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK),
 /* 1*/	FLAG_ENTRY0("CceCsrReadBadAddrErr",
@@ -341,7 +341,7 @@ static struct flag_table cce_err_status_flags[] = {
  * Misc Error flags
  */
 #define MES(text) MISC_ERR_STATUS_MISC_##text##_ERR_SMASK
-static struct flag_table misc_err_status_flags[] = {
+static const struct flag_table misc_err_status_flags[] = {
 /* 0*/	FLAG_ENTRY0("CSR_PARITY", MES(CSR_PARITY)),
 /* 1*/	FLAG_ENTRY0("CSR_READ_BAD_ADDR", MES(CSR_READ_BAD_ADDR)),
 /* 2*/	FLAG_ENTRY0("CSR_WRITE_BAD_ADDR", MES(CSR_WRITE_BAD_ADDR)),
@@ -360,7 +360,7 @@ static struct flag_table misc_err_status_flags[] = {
 /*
  * TXE PIO Error flags and consequences
  */
-static struct flag_table pio_err_status_flags[] = {
+static const struct flag_table pio_err_status_flags[] = {
 /* 0*/	FLAG_ENTRY("PioWriteBadCtxt",
 	SEC_WRITE_DROPPED,
 	SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK),
@@ -502,7 +502,7 @@
 /*
  * TXE SDMA Error flags
  */
-static struct flag_table sdma_err_status_flags[] = {
+static const struct flag_table sdma_err_status_flags[] = {
 /* 0*/	FLAG_ENTRY0("SDmaRpyTagErr",
 		SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK),
 /* 1*/	FLAG_ENTRY0("SDmaCsrParityErr",
@@ -530,7 +530,7 @@
  * TXE Egress Error flags
  */
 #define SEES(text) SEND_EGRESS_ERR_STATUS_##text##_ERR_SMASK
-static struct flag_table egress_err_status_flags[] = {
+static const struct flag_table egress_err_status_flags[] = {
 /* 0*/	FLAG_ENTRY0("TxPktIntegrityMemCorErr", SEES(TX_PKT_INTEGRITY_MEM_COR)),
 /* 1*/	FLAG_ENTRY0("TxPktIntegrityMemUncErr", SEES(TX_PKT_INTEGRITY_MEM_UNC)),
 /* 2 reserved */
@@ -631,7 +631,7 @@ static struct flag_table egress_err_status_flags[] = {
  * TXE Egress Error Info flags
  */
 #define SEEI(text) SEND_EGRESS_ERR_INFO_##text##_ERR_SMASK
-static struct flag_table egress_err_info_flags[] = {
+static const struct flag_table egress_err_info_flags[] = {
 /* 0*/	FLAG_ENTRY0("Reserved", 0ull),
 /* 1*/	FLAG_ENTRY0("VLErr", SEEI(VL)),
 /* 2*/	FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
@@ -680,7 +680,7 @@
  * TXE Send error flags
  */
 #define SES(name) SEND_ERR_STATUS_SEND_##name##_ERR_SMASK
-static struct flag_table send_err_status_flags[] = {
+static const struct flag_table send_err_status_flags[] = {
 /* 0*/	FLAG_ENTRY0("SendCsrParityErr", SES(CSR_PARITY)),
 /* 1*/	FLAG_ENTRY0("SendCsrReadBadAddrErr", SES(CSR_READ_BAD_ADDR)),
 /* 2*/	FLAG_ENTRY0("SendCsrWriteBadAddrErr", SES(CSR_WRITE_BAD_ADDR))
@@ -689,7 +689,7 @@
 /*
  * TXE Send Context Error flags and consequences
  */
-static struct flag_table sc_err_status_flags[] = {
+static const struct flag_table sc_err_status_flags[] = {
 /* 0*/	FLAG_ENTRY("InconsistentSop",
 		SEC_PACKET_DROPPED | SEC_SC_HALTED,
 		SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK),
@@ -712,7 +712,7 @@
  * RXE Receive Error flags
  */
 #define RXES(name) RCV_ERR_STATUS_RX_##name##_ERR_SMASK
-static struct flag_table rxe_err_status_flags[] = {
+static const struct flag_table rxe_err_status_flags[] = {
 /* 0*/	FLAG_ENTRY0("RxDmaCsrCorErr", RXES(DMA_CSR_COR)),
 /* 1*/	FLAG_ENTRY0("RxDcIntfParityErr", RXES(DC_INTF_PARITY)),
 /* 2*/	FLAG_ENTRY0("RxRcvHdrUncErr", RXES(RCV_HDR_UNC)),
@@ -847,7 +847,7 @@ static struct flag_table rxe_err_status_flags[] = {
  * DCC Error Flags
  */
 #define DCCE(name) DCC_ERR_FLG_##name##_SMASK
-static struct flag_table dcc_err_flags[] = {
+static const struct flag_table dcc_err_flags[] = {
 	FLAG_ENTRY0("bad_l2_err", DCCE(BAD_L2_ERR)),
 	FLAG_ENTRY0("bad_sc_err", DCCE(BAD_SC_ERR)),
 	FLAG_ENTRY0("bad_mid_tail_err", DCCE(BAD_MID_TAIL_ERR)),
@@ -900,7 +900,7 @@ static struct flag_table dcc_err_flags[] = {
  * LCB error flags
  */
 #define LCBE(name) DC_LCB_ERR_FLG_##name##_SMASK
-static struct flag_table lcb_err_flags[] = {
+static const struct flag_table lcb_err_flags[] = {
 /* 0*/	FLAG_ENTRY0("CSR_PARITY_ERR", LCBE(CSR_PARITY_ERR)),
 /* 1*/	FLAG_ENTRY0("INVALID_CSR_ADDR", LCBE(INVALID_CSR_ADDR)),
 /* 2*/	FLAG_ENTRY0("RST_FOR_FAILED_DESKEW", LCBE(RST_FOR_FAILED_DESKEW)),
@@ -943,7 +943,7 @@ static struct flag_table lcb_err_flags[] = {
  * DC8051 Error Flags
  */
 #define D8E(name) DC_DC8051_ERR_FLG_##name##_SMASK
-static struct flag_table dc8051_err_flags[] = {
+static const struct flag_table dc8051_err_flags[] = {
 	FLAG_ENTRY0("SET_BY_8051", D8E(SET_BY_8051)),
 	FLAG_ENTRY0("LOST_8051_HEART_BEAT", D8E(LOST_8051_HEART_BEAT)),
 	FLAG_ENTRY0("CRAM_MBE", D8E(CRAM_MBE)),
@@ -962,7 +962,7 @@
  *
  * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR field.
  */
-static struct flag_table dc8051_info_err_flags[] = {
+static const struct flag_table dc8051_info_err_flags[] = {
 	FLAG_ENTRY0("Spico ROM check failed", SPICO_ROM_FAILED),
 	FLAG_ENTRY0("Unknown frame received", UNKNOWN_FRAME),
 	FLAG_ENTRY0("Target BER not met", TARGET_BER_NOT_MET),
@@ -986,7 +986,7 @@
  *
  * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG field.
  */
-static struct flag_table dc8051_info_host_msg_flags[] = {
+static const struct flag_table dc8051_info_host_msg_flags[] = {
 	FLAG_ENTRY0("Host request done", 0x0001),
 	FLAG_ENTRY0("BC PWR_MGM message", 0x0002),
 	FLAG_ENTRY0("BC SMA message", 0x0004),
@@ -5275,7 +5275,7 @@ done:
  * the buffer.  End in '*' if the buffer is too short.
  */
 static char *flag_string(char *buf, int buf_len, u64 flags,
-			 struct flag_table *table, int table_size)
+			 const struct flag_table *table, int table_size)
 {
 	char extra[32];
 	char *p = buf;
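A closing note on the mechanism (illustrative, not from the patch): a
file-scope 'static struct foo x = {...}' is placed in .data, while
'static const struct foo x = {...}' is placed in .rodata, which is what
the before/after size table reflects (data shrinks by roughly the
tables' size, text/rodata grows correspondingly, and dec is unchanged).
A minimal sketch of the pattern with hypothetical names:

/* A write-protected lookup table: 'const' moves it into .rodata. */
struct example_flag {
	u64 flag;        /* the flag bit(s) */
	const char *str; /* human-readable description */
};

static const struct example_flag example_err_flags[] = {
	{ 1ULL << 0, "CsrParityErr" },
	{ 1ULL << 1, "CsrReadBadAddrErr" },
};

/* Consumers take a pointer-to-const, mirroring flag_string() above. */
static int count_named_flags(u64 status, const struct example_flag *table,
			     int table_size)
{
	int i, n = 0;

	for (i = 0; i < table_size; i++)
		if (status & table[i].flag)
			n++;
	return n;
}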