linux/drivers/infiniband/hw/hfi1/ipoib_main.c

265 lines
5.9 KiB
C
Raw Normal View History

// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
* Copyright(c) 2020 Intel Corporation.
*
*/
/*
* This file contains HFI1 support for ipoib functionality
*/
#include "ipoib.h"
#include "hfi.h"
static u32 qpn_from_mac(const u8 *mac_arr)
{
return (u32)mac_arr[1] << 16 | mac_arr[2] << 8 | mac_arr[3];
}
static int hfi1_ipoib_dev_init(struct net_device *dev)
{
struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
int ret;
dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
if (!dev->tstats)
return -ENOMEM;
ret = priv->netdev_ops->ndo_init(dev);
if (ret)
goto out_ret;
ret = hfi1_netdev_add_data(priv->dd,
qpn_from_mac(priv->netdev->dev_addr),
dev);
if (ret < 0) {
priv->netdev_ops->ndo_uninit(dev);
goto out_ret;
}
return 0;
out_ret:
free_percpu(dev->tstats);
dev->tstats = NULL;
return ret;
}
static void hfi1_ipoib_dev_uninit(struct net_device *dev)
{
struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
free_percpu(dev->tstats);
dev->tstats = NULL;
hfi1_netdev_remove_data(priv->dd, qpn_from_mac(priv->netdev->dev_addr));
priv->netdev_ops->ndo_uninit(dev);
}
static int hfi1_ipoib_dev_open(struct net_device *dev)
{
struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
int ret;
ret = priv->netdev_ops->ndo_open(dev);
if (!ret) {
struct hfi1_ibport *ibp = to_iport(priv->device,
priv->port_num);
struct rvt_qp *qp;
u32 qpn = qpn_from_mac(priv->netdev->dev_addr);
rcu_read_lock();
qp = rvt_lookup_qpn(ib_to_rvt(priv->device), &ibp->rvp, qpn);
if (!qp) {
rcu_read_unlock();
priv->netdev_ops->ndo_stop(dev);
return -EINVAL;
}
rvt_get_qp(qp);
priv->qp = qp;
rcu_read_unlock();
hfi1_netdev_enable_queues(priv->dd);
hfi1_ipoib_napi_tx_enable(dev);
}
return ret;
}
static int hfi1_ipoib_dev_stop(struct net_device *dev)
{
struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
if (!priv->qp)
return 0;
hfi1_ipoib_napi_tx_disable(dev);
hfi1_netdev_disable_queues(priv->dd);
rvt_put_qp(priv->qp);
priv->qp = NULL;
return priv->netdev_ops->ndo_stop(dev);
}
static const struct net_device_ops hfi1_ipoib_netdev_ops = {
.ndo_init = hfi1_ipoib_dev_init,
.ndo_uninit = hfi1_ipoib_dev_uninit,
.ndo_open = hfi1_ipoib_dev_open,
.ndo_stop = hfi1_ipoib_dev_stop,
.ndo_get_stats64 = dev_get_tstats64,
};
static int hfi1_ipoib_mcast_attach(struct net_device *dev,
struct ib_device *device,
union ib_gid *mgid,
u16 mlid,
int set_qkey,
u32 qkey)
{
struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
u32 qpn = (u32)qpn_from_mac(priv->netdev->dev_addr);
struct hfi1_ibport *ibp = to_iport(priv->device, priv->port_num);
struct rvt_qp *qp;
int ret = -EINVAL;
rcu_read_lock();
qp = rvt_lookup_qpn(ib_to_rvt(priv->device), &ibp->rvp, qpn);
if (qp) {
rvt_get_qp(qp);
rcu_read_unlock();
if (set_qkey)
priv->qkey = qkey;
/* attach QP to multicast group */
ret = ib_attach_mcast(&qp->ibqp, mgid, mlid);
rvt_put_qp(qp);
} else {
rcu_read_unlock();
}
return ret;
}
static int hfi1_ipoib_mcast_detach(struct net_device *dev,
struct ib_device *device,
union ib_gid *mgid,
u16 mlid)
{
struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
u32 qpn = (u32)qpn_from_mac(priv->netdev->dev_addr);
struct hfi1_ibport *ibp = to_iport(priv->device, priv->port_num);
struct rvt_qp *qp;
int ret = -EINVAL;
rcu_read_lock();
qp = rvt_lookup_qpn(ib_to_rvt(priv->device), &ibp->rvp, qpn);
if (qp) {
rvt_get_qp(qp);
rcu_read_unlock();
ret = ib_detach_mcast(&qp->ibqp, mgid, mlid);
rvt_put_qp(qp);
} else {
rcu_read_unlock();
}
return ret;
}
static void hfi1_ipoib_netdev_dtor(struct net_device *dev)
{
struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
hfi1_ipoib_txreq_deinit(priv);
hfi1_ipoib_rxq_deinit(priv->netdev);
free_percpu(dev->tstats);
dev->tstats = NULL;
}
static void hfi1_ipoib_set_id(struct net_device *dev, int id)
{
struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
priv->pkey_index = (u16)id;
ib_query_pkey(priv->device,
priv->port_num,
priv->pkey_index,
&priv->pkey);
}
static int hfi1_ipoib_setup_rn(struct ib_device *device,
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 port_num,
struct net_device *netdev,
void *param)
{
struct hfi1_devdata *dd = dd_from_ibdev(device);
struct rdma_netdev *rn = netdev_priv(netdev);
struct hfi1_ipoib_dev_priv *priv;
int rc;
rn->send = hfi1_ipoib_send;
rn->tx_timeout = hfi1_ipoib_tx_timeout;
rn->attach_mcast = hfi1_ipoib_mcast_attach;
rn->detach_mcast = hfi1_ipoib_mcast_detach;
rn->set_id = hfi1_ipoib_set_id;
rn->hca = device;
rn->port_num = port_num;
rn->mtu = netdev->mtu;
priv = hfi1_ipoib_priv(netdev);
priv->dd = dd;
priv->netdev = netdev;
priv->device = device;
priv->port_num = port_num;
priv->netdev_ops = netdev->netdev_ops;
ib_query_pkey(device, port_num, priv->pkey_index, &priv->pkey);
rc = hfi1_ipoib_txreq_init(priv);
if (rc) {
dd_dev_err(dd, "IPoIB netdev TX init - failed(%d)\n", rc);
return rc;
}
rc = hfi1_ipoib_rxq_init(netdev);
if (rc) {
dd_dev_err(dd, "IPoIB netdev RX init - failed(%d)\n", rc);
IB/hfi1: Fix AIP early init panic An early failure in hfi1_ipoib_setup_rn() can lead to the following panic: BUG: unable to handle kernel NULL pointer dereference at 00000000000001b0 PGD 0 P4D 0 Oops: 0002 [#1] SMP NOPTI Workqueue: events work_for_cpu_fn RIP: 0010:try_to_grab_pending+0x2b/0x140 Code: 1f 44 00 00 41 55 41 54 55 48 89 d5 53 48 89 fb 9c 58 0f 1f 44 00 00 48 89 c2 fa 66 0f 1f 44 00 00 48 89 55 00 40 84 f6 75 77 <f0> 48 0f ba 2b 00 72 09 31 c0 5b 5d 41 5c 41 5d c3 48 89 df e8 6c RSP: 0018:ffffb6b3cf7cfa48 EFLAGS: 00010046 RAX: 0000000000000246 RBX: 00000000000001b0 RCX: 0000000000000000 RDX: 0000000000000246 RSI: 0000000000000000 RDI: 00000000000001b0 RBP: ffffb6b3cf7cfa70 R08: 0000000000000f09 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 R13: ffffb6b3cf7cfa90 R14: ffffffff9b2fbfc0 R15: ffff8a4fdf244690 FS: 0000000000000000(0000) GS:ffff8a527f400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000001b0 CR3: 00000017e2410003 CR4: 00000000007706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: __cancel_work_timer+0x42/0x190 ? dev_printk_emit+0x4e/0x70 iowait_cancel_work+0x15/0x30 [hfi1] hfi1_ipoib_txreq_deinit+0x5a/0x220 [hfi1] ? dev_err+0x6c/0x90 hfi1_ipoib_netdev_dtor+0x15/0x30 [hfi1] hfi1_ipoib_setup_rn+0x10e/0x150 [hfi1] rdma_init_netdev+0x5a/0x80 [ib_core] ? hfi1_ipoib_free_rdma_netdev+0x20/0x20 [hfi1] ipoib_intf_init+0x6c/0x350 [ib_ipoib] ipoib_intf_alloc+0x5c/0xc0 [ib_ipoib] ipoib_add_one+0xbe/0x300 [ib_ipoib] add_client_context+0x12c/0x1a0 [ib_core] enable_device_and_get+0xdc/0x1d0 [ib_core] ib_register_device+0x572/0x6b0 [ib_core] rvt_register_device+0x11b/0x220 [rdmavt] hfi1_register_ib_device+0x6b4/0x770 [hfi1] do_init_one.isra.20+0x3e3/0x680 [hfi1] local_pci_probe+0x41/0x90 work_for_cpu_fn+0x16/0x20 process_one_work+0x1a7/0x360 ? create_worker+0x1a0/0x1a0 worker_thread+0x1cf/0x390 ? create_worker+0x1a0/0x1a0 kthread+0x116/0x130 ? kthread_flush_work_fn+0x10/0x10 ret_from_fork+0x1f/0x40 The panic happens in hfi1_ipoib_txreq_deinit() because there is a NULL deref when hfi1_ipoib_netdev_dtor() is called in this error case. hfi1_ipoib_txreq_init() and hfi1_ipoib_rxq_init() are self unwinding so fix by adjusting the error paths accordingly. Other changes: - hfi1_ipoib_free_rdma_netdev() is deleted including the free_netdev() since the netdev core code deletes calls free_netdev() - The switch to the accelerated entrances is moved to the success path. Cc: stable@vger.kernel.org Fixes: d99dc602e2a5 ("IB/hfi1: Add functions to transmit datagram ipoib packets") Link: https://lore.kernel.org/r/1642287756-182313-4-git-send-email-mike.marciniszyn@cornelisnetworks.com Reviewed-by: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com> Signed-off-by: Mike Marciniszyn <mike.marciniszyn@cornelisnetworks.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2022-01-15 23:02:35 +00:00
hfi1_ipoib_txreq_deinit(priv);
return rc;
}
IB/hfi1: Fix AIP early init panic An early failure in hfi1_ipoib_setup_rn() can lead to the following panic: BUG: unable to handle kernel NULL pointer dereference at 00000000000001b0 PGD 0 P4D 0 Oops: 0002 [#1] SMP NOPTI Workqueue: events work_for_cpu_fn RIP: 0010:try_to_grab_pending+0x2b/0x140 Code: 1f 44 00 00 41 55 41 54 55 48 89 d5 53 48 89 fb 9c 58 0f 1f 44 00 00 48 89 c2 fa 66 0f 1f 44 00 00 48 89 55 00 40 84 f6 75 77 <f0> 48 0f ba 2b 00 72 09 31 c0 5b 5d 41 5c 41 5d c3 48 89 df e8 6c RSP: 0018:ffffb6b3cf7cfa48 EFLAGS: 00010046 RAX: 0000000000000246 RBX: 00000000000001b0 RCX: 0000000000000000 RDX: 0000000000000246 RSI: 0000000000000000 RDI: 00000000000001b0 RBP: ffffb6b3cf7cfa70 R08: 0000000000000f09 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 R13: ffffb6b3cf7cfa90 R14: ffffffff9b2fbfc0 R15: ffff8a4fdf244690 FS: 0000000000000000(0000) GS:ffff8a527f400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000001b0 CR3: 00000017e2410003 CR4: 00000000007706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: __cancel_work_timer+0x42/0x190 ? dev_printk_emit+0x4e/0x70 iowait_cancel_work+0x15/0x30 [hfi1] hfi1_ipoib_txreq_deinit+0x5a/0x220 [hfi1] ? dev_err+0x6c/0x90 hfi1_ipoib_netdev_dtor+0x15/0x30 [hfi1] hfi1_ipoib_setup_rn+0x10e/0x150 [hfi1] rdma_init_netdev+0x5a/0x80 [ib_core] ? hfi1_ipoib_free_rdma_netdev+0x20/0x20 [hfi1] ipoib_intf_init+0x6c/0x350 [ib_ipoib] ipoib_intf_alloc+0x5c/0xc0 [ib_ipoib] ipoib_add_one+0xbe/0x300 [ib_ipoib] add_client_context+0x12c/0x1a0 [ib_core] enable_device_and_get+0xdc/0x1d0 [ib_core] ib_register_device+0x572/0x6b0 [ib_core] rvt_register_device+0x11b/0x220 [rdmavt] hfi1_register_ib_device+0x6b4/0x770 [hfi1] do_init_one.isra.20+0x3e3/0x680 [hfi1] local_pci_probe+0x41/0x90 work_for_cpu_fn+0x16/0x20 process_one_work+0x1a7/0x360 ? create_worker+0x1a0/0x1a0 worker_thread+0x1cf/0x390 ? create_worker+0x1a0/0x1a0 kthread+0x116/0x130 ? kthread_flush_work_fn+0x10/0x10 ret_from_fork+0x1f/0x40 The panic happens in hfi1_ipoib_txreq_deinit() because there is a NULL deref when hfi1_ipoib_netdev_dtor() is called in this error case. hfi1_ipoib_txreq_init() and hfi1_ipoib_rxq_init() are self unwinding so fix by adjusting the error paths accordingly. Other changes: - hfi1_ipoib_free_rdma_netdev() is deleted including the free_netdev() since the netdev core code deletes calls free_netdev() - The switch to the accelerated entrances is moved to the success path. Cc: stable@vger.kernel.org Fixes: d99dc602e2a5 ("IB/hfi1: Add functions to transmit datagram ipoib packets") Link: https://lore.kernel.org/r/1642287756-182313-4-git-send-email-mike.marciniszyn@cornelisnetworks.com Reviewed-by: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com> Signed-off-by: Mike Marciniszyn <mike.marciniszyn@cornelisnetworks.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2022-01-15 23:02:35 +00:00
netdev->netdev_ops = &hfi1_ipoib_netdev_ops;
netdev->priv_destructor = hfi1_ipoib_netdev_dtor;
netdev->needs_free_netdev = true;
return 0;
}
int hfi1_ipoib_rn_get_params(struct ib_device *device,
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 port_num,
enum rdma_netdev_t type,
struct rdma_netdev_alloc_params *params)
{
struct hfi1_devdata *dd = dd_from_ibdev(device);
if (type != RDMA_NETDEV_IPOIB)
return -EOPNOTSUPP;
if (!HFI1_CAP_IS_KSET(AIP) || !dd->num_netdev_contexts)
return -EOPNOTSUPP;
if (!port_num || port_num > dd->num_pports)
return -EINVAL;
params->sizeof_priv = sizeof(struct hfi1_ipoib_rdma_netdev);
params->txqs = dd->num_sdma;
params->rxqs = dd->num_netdev_contexts;
params->param = NULL;
params->initialize_rdma_netdev = hfi1_ipoib_setup_rn;
return 0;
}