From 4139032b4860c06ff3a7687041f06535fed901ed Mon Sep 17 00:00:00 2001 From: Hal Rosenstock Date: Mon, 29 Jun 2015 09:57:00 -0400 Subject: [PATCH 01/24] IB: Add rdma_cap_ib_switch helper and use where appropriate Persuant to Liran's comments on node_type on linux-rdma mailing list: In an effort to reform the RDMA core and ULPs to minimize use of node_type in struct ib_device, an additional bit is added to struct ib_device for is_switch (IB switch). This is needed to be initialized by any IB switch device driver. This is a NEW requirement on such device drivers which are all "out of tree". In addition, an ib_switch helper was added to ib_verbs.h based on the is_switch device bit rather than node_type (although those should be consistent). The RDMA core (MAD, SMI, agent, sa_query, multicast, sysfs) as well as (IPoIB and SRP) ULPs are updated where appropriate to use this new helper. In some cases, the helper is now used under the covers of using rdma_[start end]_port rather than the open coding previously used. Reviewed-by: Sean Hefty Reviewed-By: Jason Gunthorpe Reviewed-by: Ira Weiny Tested-by: Ira Weiny Signed-off-by: Hal Rosenstock Signed-off-by: Doug Ledford --- drivers/infiniband/core/agent.c | 4 +- drivers/infiniband/core/mad.c | 45 ++++++++--------------- drivers/infiniband/core/multicast.c | 8 +--- drivers/infiniband/core/opa_smi.h | 4 +- drivers/infiniband/core/sa_query.c | 8 +--- drivers/infiniband/core/smi.c | 37 +++++++++---------- drivers/infiniband/core/smi.h | 4 +- drivers/infiniband/core/sysfs.c | 2 +- drivers/infiniband/ulp/ipoib/ipoib_main.c | 12 +----- drivers/infiniband/ulp/srp/ib_srp.c | 12 +----- include/rdma/ib_verbs.h | 20 ++++++++-- 11 files changed, 66 insertions(+), 90 deletions(-) diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index c7dcfe4ca5f1..0429040304fd 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -88,7 +88,7 @@ void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh * struct ib_ah *ah; struct ib_mad_send_wr_private *mad_send_wr; - if (device->node_type == RDMA_NODE_IB_SWITCH) + if (rdma_cap_ib_switch(device)) port_priv = ib_get_agent_port(device, 0); else port_priv = ib_get_agent_port(device, port_num); @@ -122,7 +122,7 @@ void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh * memcpy(send_buf->mad, mad_hdr, resp_mad_len); send_buf->ah = ah; - if (device->node_type == RDMA_NODE_IB_SWITCH) { + if (rdma_cap_ib_switch(device)) { mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf); diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index a4b1466c1bf6..c82d751aede1 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -769,7 +769,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, bool opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device, mad_agent_priv->qp_info->port_priv->port_num); - if (device->node_type == RDMA_NODE_IB_SWITCH && + if (rdma_cap_ib_switch(device) && smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) port_num = send_wr->wr.ud.port_num; else @@ -787,7 +787,8 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, if ((opa_get_smp_direction(opa_smp) ? opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) == OPA_LID_PERMISSIVE && - opa_smi_handle_dr_smp_send(opa_smp, device->node_type, + opa_smi_handle_dr_smp_send(opa_smp, + rdma_cap_ib_switch(device), port_num) == IB_SMI_DISCARD) { ret = -EINVAL; dev_err(&device->dev, "OPA Invalid directed route\n"); @@ -810,7 +811,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, } else { if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) == IB_LID_PERMISSIVE && - smi_handle_dr_smp_send(smp, device->node_type, port_num) == + smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) == IB_SMI_DISCARD) { ret = -EINVAL; dev_err(&device->dev, "Invalid directed route\n"); @@ -2030,7 +2031,7 @@ static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv struct ib_smp *smp = (struct ib_smp *)recv->mad; if (smi_handle_dr_smp_recv(smp, - port_priv->device->node_type, + rdma_cap_ib_switch(port_priv->device), port_num, port_priv->device->phys_port_cnt) == IB_SMI_DISCARD) @@ -2042,13 +2043,13 @@ static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv if (retsmi == IB_SMI_SEND) { /* don't forward */ if (smi_handle_dr_smp_send(smp, - port_priv->device->node_type, + rdma_cap_ib_switch(port_priv->device), port_num) == IB_SMI_DISCARD) return IB_SMI_DISCARD; if (smi_check_local_smp(smp, port_priv->device) == IB_SMI_DISCARD) return IB_SMI_DISCARD; - } else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) { + } else if (rdma_cap_ib_switch(port_priv->device)) { /* forward case for switches */ memcpy(response, recv, mad_priv_size(response)); response->header.recv_wc.wc = &response->header.wc; @@ -2115,7 +2116,7 @@ handle_opa_smi(struct ib_mad_port_private *port_priv, struct opa_smp *smp = (struct opa_smp *)recv->mad; if (opa_smi_handle_dr_smp_recv(smp, - port_priv->device->node_type, + rdma_cap_ib_switch(port_priv->device), port_num, port_priv->device->phys_port_cnt) == IB_SMI_DISCARD) @@ -2127,7 +2128,7 @@ handle_opa_smi(struct ib_mad_port_private *port_priv, if (retsmi == IB_SMI_SEND) { /* don't forward */ if (opa_smi_handle_dr_smp_send(smp, - port_priv->device->node_type, + rdma_cap_ib_switch(port_priv->device), port_num) == IB_SMI_DISCARD) return IB_SMI_DISCARD; @@ -2135,7 +2136,7 @@ handle_opa_smi(struct ib_mad_port_private *port_priv, IB_SMI_DISCARD) return IB_SMI_DISCARD; - } else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) { + } else if (rdma_cap_ib_switch(port_priv->device)) { /* forward case for switches */ memcpy(response, recv, mad_priv_size(response)); response->header.recv_wc.wc = &response->header.wc; @@ -2235,7 +2236,7 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, goto out; } - if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) + if (rdma_cap_ib_switch(port_priv->device)) port_num = wc->port_num; else port_num = port_priv->port_num; @@ -3297,17 +3298,11 @@ static int ib_mad_port_close(struct ib_device *device, int port_num) static void ib_mad_init_device(struct ib_device *device) { - int start, end, i; + int start, i; - if (device->node_type == RDMA_NODE_IB_SWITCH) { - start = 0; - end = 0; - } else { - start = 1; - end = device->phys_port_cnt; - } + start = rdma_start_port(device); - for (i = start; i <= end; i++) { + for (i = start; i <= rdma_end_port(device); i++) { if (!rdma_cap_ib_mad(device, i)) continue; @@ -3342,17 +3337,9 @@ error: static void ib_mad_remove_device(struct ib_device *device) { - int start, end, i; + int i; - if (device->node_type == RDMA_NODE_IB_SWITCH) { - start = 0; - end = 0; - } else { - start = 1; - end = device->phys_port_cnt; - } - - for (i = start; i <= end; i++) { + for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { if (!rdma_cap_ib_mad(device, i)) continue; diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 1244f02a5c6d..2cb865c7ce7a 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -812,12 +812,8 @@ static void mcast_add_one(struct ib_device *device) if (!dev) return; - if (device->node_type == RDMA_NODE_IB_SWITCH) - dev->start_port = dev->end_port = 0; - else { - dev->start_port = 1; - dev->end_port = device->phys_port_cnt; - } + dev->start_port = rdma_start_port(device); + dev->end_port = rdma_end_port(device); for (i = 0; i <= dev->end_port - dev->start_port; i++) { if (!rdma_cap_ib_mcast(device, dev->start_port + i)) diff --git a/drivers/infiniband/core/opa_smi.h b/drivers/infiniband/core/opa_smi.h index 62d91bfa4cb7..3bfab3505a29 100644 --- a/drivers/infiniband/core/opa_smi.h +++ b/drivers/infiniband/core/opa_smi.h @@ -39,12 +39,12 @@ #include "smi.h" -enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, u8 node_type, +enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch, int port_num, int phys_port_cnt); int opa_smi_get_fwd_port(struct opa_smp *smp); extern enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp); extern enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, - u8 node_type, int port_num); + bool is_switch, int port_num); /* * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 0fae85062a65..ca919f429666 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1156,12 +1156,8 @@ static void ib_sa_add_one(struct ib_device *device) int s, e, i; int count = 0; - if (device->node_type == RDMA_NODE_IB_SWITCH) - s = e = 0; - else { - s = 1; - e = device->phys_port_cnt; - } + s = rdma_start_port(device); + e = rdma_end_port(device); sa_dev = kzalloc(sizeof *sa_dev + (e - s + 1) * sizeof (struct ib_sa_port), diff --git a/drivers/infiniband/core/smi.c b/drivers/infiniband/core/smi.c index 368a561d1a5d..f19b23817c2b 100644 --- a/drivers/infiniband/core/smi.c +++ b/drivers/infiniband/core/smi.c @@ -41,7 +41,7 @@ #include "smi.h" #include "opa_smi.h" -static enum smi_action __smi_handle_dr_smp_send(u8 node_type, int port_num, +static enum smi_action __smi_handle_dr_smp_send(bool is_switch, int port_num, u8 *hop_ptr, u8 hop_cnt, const u8 *initial_path, const u8 *return_path, @@ -64,7 +64,7 @@ static enum smi_action __smi_handle_dr_smp_send(u8 node_type, int port_num, /* C14-9:2 */ if (*hop_ptr && *hop_ptr < hop_cnt) { - if (node_type != RDMA_NODE_IB_SWITCH) + if (!is_switch) return IB_SMI_DISCARD; /* return_path set when received */ @@ -77,7 +77,7 @@ static enum smi_action __smi_handle_dr_smp_send(u8 node_type, int port_num, if (*hop_ptr == hop_cnt) { /* return_path set when received */ (*hop_ptr)++; - return (node_type == RDMA_NODE_IB_SWITCH || + return (is_switch || dr_dlid_is_permissive ? IB_SMI_HANDLE : IB_SMI_DISCARD); } @@ -96,7 +96,7 @@ static enum smi_action __smi_handle_dr_smp_send(u8 node_type, int port_num, /* C14-13:2 */ if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) { - if (node_type != RDMA_NODE_IB_SWITCH) + if (!is_switch) return IB_SMI_DISCARD; (*hop_ptr)--; @@ -108,7 +108,7 @@ static enum smi_action __smi_handle_dr_smp_send(u8 node_type, int port_num, if (*hop_ptr == 1) { (*hop_ptr)--; /* C14-13:3 -- SMPs destined for SM shouldn't be here */ - return (node_type == RDMA_NODE_IB_SWITCH || + return (is_switch || dr_slid_is_permissive ? IB_SMI_HANDLE : IB_SMI_DISCARD); } @@ -127,9 +127,9 @@ static enum smi_action __smi_handle_dr_smp_send(u8 node_type, int port_num, * Return IB_SMI_DISCARD if the SMP should be discarded */ enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, - u8 node_type, int port_num) + bool is_switch, int port_num) { - return __smi_handle_dr_smp_send(node_type, port_num, + return __smi_handle_dr_smp_send(is_switch, port_num, &smp->hop_ptr, smp->hop_cnt, smp->initial_path, smp->return_path, @@ -139,9 +139,9 @@ enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, } enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, - u8 node_type, int port_num) + bool is_switch, int port_num) { - return __smi_handle_dr_smp_send(node_type, port_num, + return __smi_handle_dr_smp_send(is_switch, port_num, &smp->hop_ptr, smp->hop_cnt, smp->route.dr.initial_path, smp->route.dr.return_path, @@ -152,7 +152,7 @@ enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, OPA_LID_PERMISSIVE); } -static enum smi_action __smi_handle_dr_smp_recv(u8 node_type, int port_num, +static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, int port_num, int phys_port_cnt, u8 *hop_ptr, u8 hop_cnt, const u8 *initial_path, @@ -173,7 +173,7 @@ static enum smi_action __smi_handle_dr_smp_recv(u8 node_type, int port_num, /* C14-9:2 -- intermediate hop */ if (*hop_ptr && *hop_ptr < hop_cnt) { - if (node_type != RDMA_NODE_IB_SWITCH) + if (!is_switch) return IB_SMI_DISCARD; return_path[*hop_ptr] = port_num; @@ -188,7 +188,7 @@ static enum smi_action __smi_handle_dr_smp_recv(u8 node_type, int port_num, return_path[*hop_ptr] = port_num; /* hop_ptr updated when sending */ - return (node_type == RDMA_NODE_IB_SWITCH || + return (is_switch || dr_dlid_is_permissive ? IB_SMI_HANDLE : IB_SMI_DISCARD); } @@ -208,7 +208,7 @@ static enum smi_action __smi_handle_dr_smp_recv(u8 node_type, int port_num, /* C14-13:2 */ if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) { - if (node_type != RDMA_NODE_IB_SWITCH) + if (!is_switch) return IB_SMI_DISCARD; /* hop_ptr updated when sending */ @@ -224,8 +224,7 @@ static enum smi_action __smi_handle_dr_smp_recv(u8 node_type, int port_num, return IB_SMI_HANDLE; } /* hop_ptr updated when sending */ - return (node_type == RDMA_NODE_IB_SWITCH ? - IB_SMI_HANDLE : IB_SMI_DISCARD); + return (is_switch ? IB_SMI_HANDLE : IB_SMI_DISCARD); } /* C14-13:4 -- hop_ptr = 0 -> give to SM */ @@ -238,10 +237,10 @@ static enum smi_action __smi_handle_dr_smp_recv(u8 node_type, int port_num, * Adjust information for a received SMP * Return IB_SMI_DISCARD if the SMP should be dropped */ -enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type, +enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch, int port_num, int phys_port_cnt) { - return __smi_handle_dr_smp_recv(node_type, port_num, phys_port_cnt, + return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt, &smp->hop_ptr, smp->hop_cnt, smp->initial_path, smp->return_path, @@ -254,10 +253,10 @@ enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type, * Adjust information for a received SMP * Return IB_SMI_DISCARD if the SMP should be dropped */ -enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, u8 node_type, +enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch, int port_num, int phys_port_cnt) { - return __smi_handle_dr_smp_recv(node_type, port_num, phys_port_cnt, + return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt, &smp->hop_ptr, smp->hop_cnt, smp->route.dr.initial_path, smp->route.dr.return_path, diff --git a/drivers/infiniband/core/smi.h b/drivers/infiniband/core/smi.h index aff96bac49b4..33c91c8a16e9 100644 --- a/drivers/infiniband/core/smi.h +++ b/drivers/infiniband/core/smi.h @@ -51,12 +51,12 @@ enum smi_forward_action { IB_SMI_FORWARD /* SMP should be forwarded (for switches only) */ }; -enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type, +enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch, int port_num, int phys_port_cnt); int smi_get_fwd_port(struct ib_smp *smp); extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp); extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, - u8 node_type, int port_num); + bool is_switch, int port_num); /* * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index ed6b6c85c334..0b84a9cdfe5b 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -870,7 +870,7 @@ int ib_device_register_sysfs(struct ib_device *device, goto err_put; } - if (device->node_type == RDMA_NODE_IB_SWITCH) { + if (rdma_cap_ib_switch(device)) { ret = add_port(device, 0, port_callback); if (ret) goto err_put; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index da149c278cb8..0d8336e91e40 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1684,7 +1684,7 @@ static void ipoib_add_one(struct ib_device *device) struct list_head *dev_list; struct net_device *dev; struct ipoib_dev_priv *priv; - int s, e, p; + int p; int count = 0; dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); @@ -1693,15 +1693,7 @@ static void ipoib_add_one(struct ib_device *device) INIT_LIST_HEAD(dev_list); - if (device->node_type == RDMA_NODE_IB_SWITCH) { - s = 0; - e = 0; - } else { - s = 1; - e = device->phys_port_cnt; - } - - for (p = s; p <= e; ++p) { + for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) { if (!rdma_protocol_ib(device, p)) continue; dev = ipoib_add_port("ib%d", device, p); diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 267dc4f75502..d40e321768a0 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -3379,7 +3379,7 @@ static void srp_add_one(struct ib_device *device) struct srp_device *srp_dev; struct ib_device_attr *dev_attr; struct srp_host *host; - int mr_page_shift, s, e, p; + int mr_page_shift, p; u64 max_pages_per_mr; dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); @@ -3443,15 +3443,7 @@ static void srp_add_one(struct ib_device *device) if (IS_ERR(srp_dev->mr)) goto err_pd; - if (device->node_type == RDMA_NODE_IB_SWITCH) { - s = 0; - e = 0; - } else { - s = 1; - e = device->phys_port_cnt; - } - - for (p = s; p <= e; ++p) { + for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) { host = srp_add_port(srp_dev, p); if (host) list_add_tail(&host->list, &srp_dev->dev_list); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 986fddb08579..b0f898e3b2e7 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1745,6 +1745,7 @@ struct ib_device { char node_desc[64]; __be64 node_guid; u32 local_dma_lkey; + u16 is_switch:1; u8 node_type; u8 phys_port_cnt; @@ -1823,6 +1824,20 @@ int ib_query_port(struct ib_device *device, enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num); +/** + * rdma_cap_ib_switch - Check if the device is IB switch + * @device: Device to check + * + * Device driver is responsible for setting is_switch bit on + * in ib_device structure at init time. + * + * Return: true if the device is IB switch. + */ +static inline bool rdma_cap_ib_switch(const struct ib_device *device) +{ + return device->is_switch; +} + /** * rdma_start_port - Return the first valid port number for the device * specified @@ -1833,7 +1848,7 @@ enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, */ static inline u8 rdma_start_port(const struct ib_device *device) { - return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1; + return rdma_cap_ib_switch(device) ? 0 : 1; } /** @@ -1846,8 +1861,7 @@ static inline u8 rdma_start_port(const struct ib_device *device) */ static inline u8 rdma_end_port(const struct ib_device *device) { - return (device->node_type == RDMA_NODE_IB_SWITCH) ? - 0 : device->phys_port_cnt; + return rdma_cap_ib_switch(device) ? 0 : device->phys_port_cnt; } static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num) From cd4cd565e049c6e734a12754d91d1142c25f6a64 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 25 Jun 2015 12:04:49 -0400 Subject: [PATCH 02/24] IB/mad: Fix compare between big endian and cpu endian The define OPA_LID_PERMISSIVE is big endian and was compared to the cpu endian variable opa_drslid. Problem caught by 0-day build infrastructure. Fixes: 8e4349d13f33 (IB/mad: Add final OPA MAD processing) Signed-off-by: Ira Weiny Reviewed-by: Mike Marciniszyn Reviewed-by: John, Jubin Signed-off-by: Doug Ledford --- drivers/infiniband/core/mad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index c82d751aede1..786fc51bf04b 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -795,7 +795,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, goto out; } opa_drslid = be32_to_cpu(opa_smp->route.dr.dr_slid); - if (opa_drslid != OPA_LID_PERMISSIVE && + if (opa_drslid != be32_to_cpu(OPA_LID_PERMISSIVE) && opa_drslid & 0xffff0000) { ret = -EINVAL; dev_err(&device->dev, "OPA Invalid dr_slid 0x%x\n", From 3b8ab700af0a5c5e59c833f8a88f94b97499f5e5 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 25 Jun 2015 09:52:50 -0400 Subject: [PATCH 03/24] IB/mad: Remove improper use of BUG_ON We recently added BUG_ON's which were inappropriate for a condition which should never happen. Change these to be WARN_ON_ONCE as a debugging aid. Fixes: 4cd7c9479aff ('IB/mad: Add support for additional MAD info to/from drivers') Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/hw/ehca/ehca_sqp.c | 5 +++-- drivers/infiniband/hw/ipath/ipath_mad.c | 5 +++-- drivers/infiniband/hw/mlx4/mad.c | 5 +++-- drivers/infiniband/hw/mlx5/mad.c | 5 +++-- drivers/infiniband/hw/mthca/mthca_mad.c | 5 +++-- drivers/infiniband/hw/ocrdma/ocrdma_ah.c | 5 +++-- drivers/infiniband/hw/qib/qib_mad.c | 5 +++-- 7 files changed, 21 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/ehca/ehca_sqp.c b/drivers/infiniband/hw/ehca/ehca_sqp.c index 12b5bc23832b..376b031c2c7f 100644 --- a/drivers/infiniband/hw/ehca/ehca_sqp.c +++ b/drivers/infiniband/hw/ehca/ehca_sqp.c @@ -226,8 +226,9 @@ int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_mad *in_mad = (const struct ib_mad *)in; struct ib_mad *out_mad = (struct ib_mad *)out; - BUG_ON(in_mad_size != sizeof(*in_mad) || - *out_mad_size != sizeof(*out_mad)); + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; if (!port_num || port_num > ibdev->phys_port_cnt || !in_wc) return IB_MAD_RESULT_FAILURE; diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c index 948188e37f95..ad3a926ab3c5 100644 --- a/drivers/infiniband/hw/ipath/ipath_mad.c +++ b/drivers/infiniband/hw/ipath/ipath_mad.c @@ -1499,8 +1499,9 @@ int ipath_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_mad *in_mad = (const struct ib_mad *)in; struct ib_mad *out_mad = (struct ib_mad *)out; - BUG_ON(in_mad_size != sizeof(*in_mad) || - *out_mad_size != sizeof(*out_mad)); + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; switch (in_mad->mad_hdr.mgmt_class) { case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 85a50df2f203..9a810e302f6b 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -861,8 +861,9 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_mad *in_mad = (const struct ib_mad *)in; struct ib_mad *out_mad = (struct ib_mad *)out; - BUG_ON(in_mad_size != sizeof(*in_mad) || - *out_mad_size != sizeof(*out_mad)); + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; switch (rdma_port_get_link_layer(ibdev, port_num)) { case IB_LINK_LAYER_INFINIBAND: diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 01fc97db45d6..b84d13a487cc 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -68,8 +68,9 @@ int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_mad *in_mad = (const struct ib_mad *)in; struct ib_mad *out_mad = (struct ib_mad *)out; - BUG_ON(in_mad_size != sizeof(*in_mad) || - *out_mad_size != sizeof(*out_mad)); + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index 6b2418b74c99..7c3f2fb44ba5 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -209,8 +209,9 @@ int mthca_process_mad(struct ib_device *ibdev, const struct ib_mad *in_mad = (const struct ib_mad *)in; struct ib_mad *out_mad = (struct ib_mad *)out; - BUG_ON(in_mad_size != sizeof(*in_mad) || - *out_mad_size != sizeof(*out_mad)); + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; /* Forward locally generated traps to the SM */ if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index 4bafa15708d0..29b27675dd70 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -215,8 +215,9 @@ int ocrdma_process_mad(struct ib_device *ibdev, const struct ib_mad *in_mad = (const struct ib_mad *)in; struct ib_mad *out_mad = (struct ib_mad *)out; - BUG_ON(in_mad_size != sizeof(*in_mad) || - *out_mad_size != sizeof(*out_mad)); + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; switch (in_mad->mad_hdr.mgmt_class) { case IB_MGMT_CLASS_PERF_MGMT: diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c index 05e3242d8442..9625e7c438e5 100644 --- a/drivers/infiniband/hw/qib/qib_mad.c +++ b/drivers/infiniband/hw/qib/qib_mad.c @@ -2412,8 +2412,9 @@ int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port, const struct ib_mad *in_mad = (const struct ib_mad *)in; struct ib_mad *out_mad = (struct ib_mad *)out; - BUG_ON(in_mad_size != sizeof(*in_mad) || - *out_mad_size != sizeof(*out_mad)); + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; switch (in_mad->mad_hdr.mgmt_class) { case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: From b356c1c1f90a347b6f67d9272dda7ecf47e0b46c Mon Sep 17 00:00:00 2001 From: Vaishali Thakkar Date: Wed, 24 Jun 2015 10:12:13 +0530 Subject: [PATCH 04/24] IB/srpt: Convert use of __constant_cpu_to_beXX to cpu_to_beXX In little endian cases, the macro cpu_to_be{16,32,64} unfolds to __swab{16,32,64} which provides special case for constants. In big endian cases, __constant_cpu_to_be{16,32,64} and cpu_to_be{16,32,64} expand directly to the same expression. So, replace __constant_cpu_to_be{16,32,64} with cpu_to_be{16,32,64} with the goal of getting rid of the definitions of __constant_cpu_to_be{16,32,64} completely. The Coccinelle semantic patch that performs this transformation is as follows: @@expression x;@@ ( - __constant_cpu_to_be16(x) + cpu_to_be16(x) | - __constant_cpu_to_be32(x) + cpu_to_be32(x) | - __constant_cpu_to_be64(x) + cpu_to_be64(x) ) Signed-off-by: Vaishali Thakkar Reviewed-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 71 +++++++++++++-------------- 1 file changed, 35 insertions(+), 36 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 82897ca17f32..60ff0a2390e5 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -302,7 +302,7 @@ static void srpt_get_iou(struct ib_dm_mad *mad) int i; ioui = (struct ib_dm_iou_info *)mad->data; - ioui->change_id = __constant_cpu_to_be16(1); + ioui->change_id = cpu_to_be16(1); ioui->max_controllers = 16; /* set present for slot 1 and empty for the rest */ @@ -330,13 +330,13 @@ static void srpt_get_ioc(struct srpt_port *sport, u32 slot, if (!slot || slot > 16) { mad->mad_hdr.status - = __constant_cpu_to_be16(DM_MAD_STATUS_INVALID_FIELD); + = cpu_to_be16(DM_MAD_STATUS_INVALID_FIELD); return; } if (slot > 2) { mad->mad_hdr.status - = __constant_cpu_to_be16(DM_MAD_STATUS_NO_IOC); + = cpu_to_be16(DM_MAD_STATUS_NO_IOC); return; } @@ -348,10 +348,10 @@ static void srpt_get_ioc(struct srpt_port *sport, u32 slot, iocp->device_version = cpu_to_be16(sdev->dev_attr.hw_ver); iocp->subsys_vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id); iocp->subsys_device_id = 0x0; - iocp->io_class = __constant_cpu_to_be16(SRP_REV16A_IB_IO_CLASS); - iocp->io_subclass = __constant_cpu_to_be16(SRP_IO_SUBCLASS); - iocp->protocol = __constant_cpu_to_be16(SRP_PROTOCOL); - iocp->protocol_version = __constant_cpu_to_be16(SRP_PROTOCOL_VERSION); + iocp->io_class = cpu_to_be16(SRP_REV16A_IB_IO_CLASS); + iocp->io_subclass = cpu_to_be16(SRP_IO_SUBCLASS); + iocp->protocol = cpu_to_be16(SRP_PROTOCOL); + iocp->protocol_version = cpu_to_be16(SRP_PROTOCOL_VERSION); iocp->send_queue_depth = cpu_to_be16(sdev->srq_size); iocp->rdma_read_depth = 4; iocp->send_size = cpu_to_be32(srp_max_req_size); @@ -379,13 +379,13 @@ static void srpt_get_svc_entries(u64 ioc_guid, if (!slot || slot > 16) { mad->mad_hdr.status - = __constant_cpu_to_be16(DM_MAD_STATUS_INVALID_FIELD); + = cpu_to_be16(DM_MAD_STATUS_INVALID_FIELD); return; } if (slot > 2 || lo > hi || hi > 1) { mad->mad_hdr.status - = __constant_cpu_to_be16(DM_MAD_STATUS_NO_IOC); + = cpu_to_be16(DM_MAD_STATUS_NO_IOC); return; } @@ -436,7 +436,7 @@ static void srpt_mgmt_method_get(struct srpt_port *sp, struct ib_mad *rq_mad, break; default: rsp_mad->mad_hdr.status = - __constant_cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD_ATTR); + cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD_ATTR); break; } } @@ -493,11 +493,11 @@ static void srpt_mad_recv_handler(struct ib_mad_agent *mad_agent, break; case IB_MGMT_METHOD_SET: dm_mad->mad_hdr.status = - __constant_cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD_ATTR); + cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD_ATTR); break; default: dm_mad->mad_hdr.status = - __constant_cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD); + cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD); break; } @@ -1535,7 +1535,7 @@ static int srpt_build_cmd_rsp(struct srpt_rdma_ch *ch, memset(srp_rsp, 0, sizeof *srp_rsp); srp_rsp->opcode = SRP_RSP; srp_rsp->req_lim_delta = - __constant_cpu_to_be32(1 + atomic_xchg(&ch->req_lim_delta, 0)); + cpu_to_be32(1 + atomic_xchg(&ch->req_lim_delta, 0)); srp_rsp->tag = tag; srp_rsp->status = status; @@ -1585,8 +1585,8 @@ static int srpt_build_tskmgmt_rsp(struct srpt_rdma_ch *ch, memset(srp_rsp, 0, sizeof *srp_rsp); srp_rsp->opcode = SRP_RSP; - srp_rsp->req_lim_delta = __constant_cpu_to_be32(1 - + atomic_xchg(&ch->req_lim_delta, 0)); + srp_rsp->req_lim_delta = + cpu_to_be32(1 + atomic_xchg(&ch->req_lim_delta, 0)); srp_rsp->tag = tag; srp_rsp->flags |= SRP_RSP_FLAG_RSPVALID; @@ -1630,7 +1630,7 @@ static uint64_t srpt_unpack_lun(const uint8_t *lun, int len) switch (len) { case 8: if ((*((__be64 *)lun) & - __constant_cpu_to_be64(0x0000FFFFFFFFFFFFLL)) != 0) + cpu_to_be64(0x0000FFFFFFFFFFFFLL)) != 0) goto out_err; break; case 4: @@ -2449,8 +2449,8 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, } if (it_iu_len > srp_max_req_size || it_iu_len < 64) { - rej->reason = __constant_cpu_to_be32( - SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE); + rej->reason = cpu_to_be32( + SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE); ret = -EINVAL; pr_err("rejected SRP_LOGIN_REQ because its" " length (%d bytes) is out of range (%d .. %d)\n", @@ -2459,8 +2459,8 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, } if (!sport->enabled) { - rej->reason = __constant_cpu_to_be32( - SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + rej->reason = cpu_to_be32( + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); ret = -EINVAL; pr_err("rejected SRP_LOGIN_REQ because the target port" " has not yet been enabled\n"); @@ -2505,8 +2505,8 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, if (*(__be64 *)req->target_port_id != cpu_to_be64(srpt_service_guid) || *(__be64 *)(req->target_port_id + 8) != cpu_to_be64(srpt_service_guid)) { - rej->reason = __constant_cpu_to_be32( - SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL); + rej->reason = cpu_to_be32( + SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL); ret = -ENOMEM; pr_err("rejected SRP_LOGIN_REQ because it" " has an invalid target port identifier.\n"); @@ -2515,8 +2515,8 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, ch = kzalloc(sizeof *ch, GFP_KERNEL); if (!ch) { - rej->reason = __constant_cpu_to_be32( - SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + rej->reason = cpu_to_be32( + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); pr_err("rejected SRP_LOGIN_REQ because no memory.\n"); ret = -ENOMEM; goto reject; @@ -2552,8 +2552,8 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, ret = srpt_create_ch_ib(ch); if (ret) { - rej->reason = __constant_cpu_to_be32( - SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + rej->reason = cpu_to_be32( + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); pr_err("rejected SRP_LOGIN_REQ because creating" " a new RDMA channel failed.\n"); goto free_ring; @@ -2561,8 +2561,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, ret = srpt_ch_qp_rtr(ch, ch->qp); if (ret) { - rej->reason = __constant_cpu_to_be32( - SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); pr_err("rejected SRP_LOGIN_REQ because enabling" " RTR failed (error code = %d)\n", ret); goto destroy_ib; @@ -2580,15 +2579,15 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, if (!nacl) { pr_info("Rejected login because no ACL has been" " configured yet for initiator %s.\n", ch->sess_name); - rej->reason = __constant_cpu_to_be32( - SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED); + rej->reason = cpu_to_be32( + SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED); goto destroy_ib; } ch->sess = transport_init_session(TARGET_PROT_NORMAL); if (IS_ERR(ch->sess)) { - rej->reason = __constant_cpu_to_be32( - SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + rej->reason = cpu_to_be32( + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); pr_debug("Failed to create session\n"); goto deregister_session; } @@ -2604,8 +2603,8 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, rsp->max_it_iu_len = req->req_it_iu_len; rsp->max_ti_iu_len = req->req_it_iu_len; ch->max_ti_iu_len = it_iu_len; - rsp->buf_fmt = __constant_cpu_to_be16(SRP_BUF_FORMAT_DIRECT - | SRP_BUF_FORMAT_INDIRECT); + rsp->buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT + | SRP_BUF_FORMAT_INDIRECT); rsp->req_lim_delta = cpu_to_be32(ch->rq_size); atomic_set(&ch->req_lim, ch->rq_size); atomic_set(&ch->req_lim_delta, 0); @@ -2655,8 +2654,8 @@ free_ch: reject: rej->opcode = SRP_LOGIN_REJ; rej->tag = req->tag; - rej->buf_fmt = __constant_cpu_to_be16(SRP_BUF_FORMAT_DIRECT - | SRP_BUF_FORMAT_INDIRECT); + rej->buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT + | SRP_BUF_FORMAT_INDIRECT); ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, (void *)rej, sizeof *rej); From 3fdf70acec13efc7ecf254f5a364df06348bfd2c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 25 Jun 2015 13:34:15 +0300 Subject: [PATCH 05/24] IB/srp: Avoid using uninitialized variable We might return res which is not initialized. Also reduce code duplication by exporting srp_parse_tmo so srp_tmo_set can reuse it. Detected by Coverity. Signed-off-by: Sagi Grimberg Signed-off-by: Jenny Falkovich Reviewed-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srp/ib_srp.c | 11 ++++------- drivers/scsi/scsi_transport_srp.c | 3 ++- include/scsi/scsi_transport_srp.h | 1 + 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index d40e321768a0..31a20b462266 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -161,13 +161,10 @@ static int srp_tmo_set(const char *val, const struct kernel_param *kp) { int tmo, res; - if (strncmp(val, "off", 3) != 0) { - res = kstrtoint(val, 0, &tmo); - if (res) - goto out; - } else { - tmo = -1; - } + res = srp_parse_tmo(&tmo, val); + if (res) + goto out; + if (kp->arg == &srp_reconnect_delay) res = srp_tmo_valid(tmo, srp_fast_io_fail_tmo, srp_dev_loss_tmo); diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c index a85292b1d09d..e3cd3ece4412 100644 --- a/drivers/scsi/scsi_transport_srp.c +++ b/drivers/scsi/scsi_transport_srp.c @@ -203,7 +203,7 @@ static ssize_t srp_show_tmo(char *buf, int tmo) return tmo >= 0 ? sprintf(buf, "%d\n", tmo) : sprintf(buf, "off\n"); } -static int srp_parse_tmo(int *tmo, const char *buf) +int srp_parse_tmo(int *tmo, const char *buf) { int res = 0; @@ -214,6 +214,7 @@ static int srp_parse_tmo(int *tmo, const char *buf) return res; } +EXPORT_SYMBOL(srp_parse_tmo); static ssize_t show_reconnect_delay(struct device *dev, struct device_attribute *attr, char *buf) diff --git a/include/scsi/scsi_transport_srp.h b/include/scsi/scsi_transport_srp.h index cdb05dd1d440..d40d3ef25707 100644 --- a/include/scsi/scsi_transport_srp.h +++ b/include/scsi/scsi_transport_srp.h @@ -119,6 +119,7 @@ extern struct srp_rport *srp_rport_add(struct Scsi_Host *, extern void srp_rport_del(struct srp_rport *); extern int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo, int dev_loss_tmo); +int srp_parse_tmo(int *tmo, const char *buf); extern int srp_reconnect_rport(struct srp_rport *rport); extern void srp_start_tl_fail_timers(struct srp_rport *rport); extern void srp_remove_host(struct Scsi_Host *); From be4b499323bf7291b491c6df51baae62f45b8404 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Thu, 25 Jun 2015 17:13:22 +0300 Subject: [PATCH 06/24] IB/cm: Do not queue work to a device that's going away Whenever ib_cm gets remove_one call, like when there is a hot-unplug event, the driver should mark itself as going_down and confirm that no new works are going to be queued for that device. so, the order of the actions are: 1. mark the going_down bit. 2. flush the wq. 3. [make sure no new works for that device.] 4. unregister mad agent. otherwise, works that are already queued can be scheduled after the mad agent was freed. Signed-off-by: Erez Shitrit Signed-off-by: Doug Ledford --- drivers/infiniband/core/cm.c | 61 ++++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index dbddddd6fb5d..3a972ebf3c0d 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -169,6 +169,7 @@ struct cm_device { struct ib_device *ib_device; struct device *device; u8 ack_delay; + int going_down; struct cm_port *port[0]; }; @@ -805,6 +806,11 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv) { int wait_time; unsigned long flags; + struct cm_device *cm_dev; + + cm_dev = ib_get_client_data(cm_id_priv->id.device, &cm_client); + if (!cm_dev) + return; spin_lock_irqsave(&cm.lock, flags); cm_cleanup_timewait(cm_id_priv->timewait_info); @@ -818,8 +824,14 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv) */ cm_id_priv->id.state = IB_CM_TIMEWAIT; wait_time = cm_convert_to_ms(cm_id_priv->av.timeout); - queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, - msecs_to_jiffies(wait_time)); + + /* Check if the device started its remove_one */ + spin_lock_irq(&cm.lock); + if (!cm_dev->going_down) + queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, + msecs_to_jiffies(wait_time)); + spin_unlock_irq(&cm.lock); + cm_id_priv->timewait_info = NULL; } @@ -3305,6 +3317,11 @@ static int cm_establish(struct ib_cm_id *cm_id) struct cm_work *work; unsigned long flags; int ret = 0; + struct cm_device *cm_dev; + + cm_dev = ib_get_client_data(cm_id->device, &cm_client); + if (!cm_dev) + return -ENODEV; work = kmalloc(sizeof *work, GFP_ATOMIC); if (!work) @@ -3343,7 +3360,17 @@ static int cm_establish(struct ib_cm_id *cm_id) work->remote_id = cm_id->remote_id; work->mad_recv_wc = NULL; work->cm_event.event = IB_CM_USER_ESTABLISHED; - queue_delayed_work(cm.wq, &work->work, 0); + + /* Check if the device started its remove_one */ + spin_lock_irq(&cm.lock); + if (!cm_dev->going_down) { + queue_delayed_work(cm.wq, &work->work, 0); + } else { + kfree(work); + ret = -ENODEV; + } + spin_unlock_irq(&cm.lock); + out: return ret; } @@ -3394,6 +3421,7 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent, enum ib_cm_event_type event; u16 attr_id; int paths = 0; + int going_down = 0; switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) { case CM_REQ_ATTR_ID: @@ -3452,7 +3480,19 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent, work->cm_event.event = event; work->mad_recv_wc = mad_recv_wc; work->port = port; - queue_delayed_work(cm.wq, &work->work, 0); + + /* Check if the device started its remove_one */ + spin_lock_irq(&cm.lock); + if (!port->cm_dev->going_down) + queue_delayed_work(cm.wq, &work->work, 0); + else + going_down = 1; + spin_unlock_irq(&cm.lock); + + if (going_down) { + kfree(work); + ib_free_recv_mad(mad_recv_wc); + } } static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, @@ -3771,7 +3811,7 @@ static void cm_add_one(struct ib_device *ib_device) cm_dev->ib_device = ib_device; cm_get_ack_delay(cm_dev); - + cm_dev->going_down = 0; cm_dev->device = device_create(&cm_class, &ib_device->dev, MKDEV(0, 0), NULL, "%s", ib_device->name); @@ -3864,14 +3904,23 @@ static void cm_remove_one(struct ib_device *ib_device) list_del(&cm_dev->list); write_unlock_irqrestore(&cm.device_lock, flags); + spin_lock_irq(&cm.lock); + cm_dev->going_down = 1; + spin_unlock_irq(&cm.lock); + for (i = 1; i <= ib_device->phys_port_cnt; i++) { if (!rdma_cap_ib_cm(ib_device, i)) continue; port = cm_dev->port[i-1]; ib_modify_port(ib_device, port->port_num, 0, &port_modify); - ib_unregister_mad_agent(port->mad_agent); + /* + * We flush the queue here after the going_down set, this + * verify that no new works will be queued in the recv handler, + * after that we can call the unregister_mad_agent + */ flush_workqueue(cm.wq); + ib_unregister_mad_agent(port->mad_agent); cm_remove_port_fs(port); } device_unregister(cm_dev->device); From 8a7ff14dcb40bade309cd2ad04c746bb1d169c4e Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 1 Jul 2015 14:31:02 +0300 Subject: [PATCH 07/24] IB/mlx4: Do not attemp to report HCA clock offset on VFs mlx4 VFs can provide CQE raw time-stamping services, but they don't have the hca core clock mapped to their PCI bars. As such, we should not attempt to query and report the clock offset to user space for VFs. Doing so causes query_device over VFs to fail with -ENOSUPP. Fixes: 4b664c4355b2 ('IB/mlx4: Add support for CQ time-stamping') Signed-off-by: Matan Barak Signed-off-by: Or Gerlitz Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 067a691ecbed..f419a728a22b 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -253,14 +253,15 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->hca_core_clock = dev->dev->caps.hca_core_clock * 1000UL; props->timestamp_mask = 0xFFFFFFFFFFFFULL; - err = mlx4_get_internal_clock_params(dev->dev, &clock_params); - if (err) - goto out; + if (!mlx4_is_slave(dev->dev)) + err = mlx4_get_internal_clock_params(dev->dev, &clock_params); if (uhw->outlen >= resp.response_length + sizeof(resp.hca_core_clock_offset)) { - resp.hca_core_clock_offset = clock_params.offset % PAGE_SIZE; resp.response_length += sizeof(resp.hca_core_clock_offset); - resp.comp_mask |= QUERY_DEVICE_RESP_MASK_TIMESTAMP; + if (!err && !mlx4_is_slave(dev->dev)) { + resp.comp_mask |= QUERY_DEVICE_RESP_MASK_TIMESTAMP; + resp.hca_core_clock_offset = clock_params.offset % PAGE_SIZE; + } } if (uhw->outlen) { From 58e9cc90cda7bc732856a2ad3210328fbc4f6ca2 Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Wed, 1 Jul 2015 14:31:01 +0300 Subject: [PATCH 08/24] IB/IPoIB: Fix bad error flow in ipoib_add_port() Error values of ib_query_port() and ib_query_device() weren't propagated correctly. Because of that, ipoib_add_port() could return NULL value, which escaped the IS_ERR() check in ipoib_add_one() and we crashed. Signed-off-by: Amir Vadai Signed-off-by: Or Gerlitz Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 0d8336e91e40..da3e7e7d5e80 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1577,7 +1577,8 @@ static struct net_device *ipoib_add_port(const char *format, SET_NETDEV_DEV(priv->dev, hca->dma_device); priv->dev->dev_id = port - 1; - if (!ib_query_port(hca, port, &attr)) + result = ib_query_port(hca, port, &attr); + if (!result) priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); else { printk(KERN_WARNING "%s: ib_query_port %d failed\n", @@ -1598,7 +1599,8 @@ static struct net_device *ipoib_add_port(const char *format, goto device_init_failed; } - if (ipoib_set_dev_features(priv, hca)) + result = ipoib_set_dev_features(priv, hca); + if (result) goto device_init_failed; /* From a7f2f24cd716d7f1119d46929f41a8436f9c252f Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Thu, 2 Jul 2015 12:47:44 -0500 Subject: [PATCH 09/24] RDMA/core: Fixes for port mapper client registration Fixes to allow clients to make remove mapping requests, after they have provided the user space service with the mapping information, they are using when the service is restarted. 1) Adding IWPM_REG_VALID, IWPM_REG_INCOMPL and IWPM_REG_UNDEF registration types for the port mapper clients and functions to set/check the registration type. 2) If the port mapper user space service is not available to register the client, then its registration stays IWPM_REG_UNDEF and the registration isn't checked until the service becomes available (no mappings are possible, if the user space service isn't running). 3) After the service is restarted, the user space port mapper pid is set to valid and the client registration is set to IWPM_REG_INCOMPL to allow the client to make remove mapping requests. Signed-off-by: Tatyana Nikolova Reviewed-by: Steve Wise Tested-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/core/iwpm_msg.c | 33 ++++++++++++++--------------- drivers/infiniband/core/iwpm_util.c | 12 +++++++++-- drivers/infiniband/core/iwpm_util.h | 28 ++++++++++++++++++------ 3 files changed, 48 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index e6ffa2e66c1a..22a3abee2a54 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -67,7 +67,8 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) err_str = "Invalid port mapper client"; goto pid_query_error; } - if (iwpm_registered_client(nl_client)) + if (iwpm_check_registration(nl_client, IWPM_REG_VALID) || + iwpm_user_pid == IWPM_PID_UNAVAILABLE) return 0; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REG_PID, &nlh, nl_client); if (!skb) { @@ -106,7 +107,6 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_IWPM, GFP_KERNEL); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ - iwpm_set_registered(nl_client, 1); iwpm_user_pid = IWPM_PID_UNAVAILABLE; err_str = "Unable to send a nlmsg"; goto pid_query_error; @@ -144,12 +144,12 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) err_str = "Invalid port mapper client"; goto add_mapping_error; } - if (!iwpm_registered_client(nl_client)) { + if (!iwpm_valid_pid()) + return 0; + if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) { err_str = "Unregistered port mapper client"; goto add_mapping_error; } - if (!iwpm_valid_pid()) - return 0; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_ADD_MAPPING, &nlh, nl_client); if (!skb) { err_str = "Unable to create a nlmsg"; @@ -214,12 +214,12 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) err_str = "Invalid port mapper client"; goto query_mapping_error; } - if (!iwpm_registered_client(nl_client)) { + if (!iwpm_valid_pid()) + return 0; + if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) { err_str = "Unregistered port mapper client"; goto query_mapping_error; } - if (!iwpm_valid_pid()) - return 0; ret = -ENOMEM; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_QUERY_MAPPING, &nlh, nl_client); if (!skb) { @@ -288,12 +288,12 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) err_str = "Invalid port mapper client"; goto remove_mapping_error; } - if (!iwpm_registered_client(nl_client)) { + if (!iwpm_valid_pid()) + return 0; + if (iwpm_check_registration(nl_client, IWPM_REG_UNDEF)) { err_str = "Unregistered port mapper client"; goto remove_mapping_error; } - if (!iwpm_valid_pid()) - return 0; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REMOVE_MAPPING, &nlh, nl_client); if (!skb) { ret = -ENOMEM; @@ -388,7 +388,7 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", __func__, iwpm_user_pid); if (iwpm_valid_client(nl_client)) - iwpm_set_registered(nl_client, 1); + iwpm_set_registration(nl_client, IWPM_REG_VALID); register_pid_response_exit: nlmsg_request->request_done = 1; /* always for found nlmsg_request */ @@ -644,7 +644,6 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *nltb[IWPM_NLA_MAPINFO_REQ_MAX]; const char *msg_type = "Mapping Info response"; - int iwpm_pid; u8 nl_client; char *iwpm_name; u16 iwpm_version; @@ -669,14 +668,14 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) __func__, nl_client); return ret; } - iwpm_set_registered(nl_client, 0); + iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + iwpm_user_pid = cb->nlh->nlmsg_pid; if (!iwpm_mapinfo_available()) return 0; - iwpm_pid = cb->nlh->nlmsg_pid; pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", - __func__, iwpm_pid); - ret = iwpm_send_mapinfo(nl_client, iwpm_pid); + __func__, iwpm_user_pid); + ret = iwpm_send_mapinfo(nl_client, iwpm_user_pid); return ret; } EXPORT_SYMBOL(iwpm_mapping_info_cb); diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index a626795bf9c7..5fb089e91353 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -78,6 +78,7 @@ init_exit: mutex_unlock(&iwpm_admin_lock); if (!ret) { iwpm_set_valid(nl_client, 1); + iwpm_set_registration(nl_client, IWPM_REG_UNDEF); pr_debug("%s: Mapinfo and reminfo tables are created\n", __func__); } @@ -106,6 +107,7 @@ int iwpm_exit(u8 nl_client) } mutex_unlock(&iwpm_admin_lock); iwpm_set_valid(nl_client, 0); + iwpm_set_registration(nl_client, IWPM_REG_UNDEF); return 0; } EXPORT_SYMBOL(iwpm_exit); @@ -397,17 +399,23 @@ void iwpm_set_valid(u8 nl_client, int valid) } /* valid client */ -int iwpm_registered_client(u8 nl_client) +u32 iwpm_get_registration(u8 nl_client) { return iwpm_admin.reg_list[nl_client]; } /* valid client */ -void iwpm_set_registered(u8 nl_client, int reg) +void iwpm_set_registration(u8 nl_client, u32 reg) { iwpm_admin.reg_list[nl_client] = reg; } +/* valid client */ +u32 iwpm_check_registration(u8 nl_client, u32 reg) +{ + return (iwpm_get_registration(nl_client) & reg); +} + int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, struct sockaddr_storage *b_sockaddr) { diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h index ee2d9ff095be..b7b9e194ce81 100644 --- a/drivers/infiniband/core/iwpm_util.h +++ b/drivers/infiniband/core/iwpm_util.h @@ -58,6 +58,10 @@ #define IWPM_PID_UNDEFINED -1 #define IWPM_PID_UNAVAILABLE -2 +#define IWPM_REG_UNDEF 0x01 +#define IWPM_REG_VALID 0x02 +#define IWPM_REG_INCOMPL 0x04 + struct iwpm_nlmsg_request { struct list_head inprocess_list; __u32 nlmsg_seq; @@ -88,7 +92,7 @@ struct iwpm_admin_data { atomic_t refcount; atomic_t nlmsg_seq; int client_list[RDMA_NL_NUM_CLIENTS]; - int reg_list[RDMA_NL_NUM_CLIENTS]; + u32 reg_list[RDMA_NL_NUM_CLIENTS]; }; /** @@ -159,19 +163,31 @@ int iwpm_valid_client(u8 nl_client); void iwpm_set_valid(u8 nl_client, int valid); /** - * iwpm_registered_client - Check if the port mapper client is registered + * iwpm_check_registration - Check if the client registration + * matches the given one * @nl_client: The index of the netlink client + * @reg: The given registration type to compare with * * Call iwpm_register_pid() to register a client + * Returns true if the client registration matches reg, + * otherwise returns false */ -int iwpm_registered_client(u8 nl_client); +u32 iwpm_check_registration(u8 nl_client, u32 reg); /** - * iwpm_set_registered - Set the port mapper client to registered or not + * iwpm_set_registration - Set the client registration * @nl_client: The index of the netlink client - * @reg: 1 if registered or 0 if not + * @reg: Registration type to set */ -void iwpm_set_registered(u8 nl_client, int reg); +void iwpm_set_registration(u8 nl_client, u32 reg); + +/** + * iwpm_get_registration + * @nl_client: The index of the netlink client + * + * Returns the client registration type + */ +u32 iwpm_get_registration(u8 nl_client); /** * iwpm_send_mapinfo - Send local and mapped IPv4/IPv6 address info of From 165d68228906f2866a52069ef1176ab70fa9e216 Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Thu, 2 Jul 2015 12:49:40 -0500 Subject: [PATCH 10/24] RDMA/nes: Fix for resolving the neigh Neighbor resolution doesn't work without this fix Signed-off-by: Tatyana Nikolova Signed-off-by: Doug Ledford --- drivers/infiniband/hw/nes/nes_cm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 9047af429906..8a3ad170d790 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1520,8 +1520,9 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpi int rc = arpindex; struct net_device *netdev; struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter; + __be32 dst_ipaddr = htonl(dst_ip); - rt = ip_route_output(&init_net, htonl(dst_ip), 0, 0, 0); + rt = ip_route_output(&init_net, dst_ipaddr, nesvnic->local_ipaddr, 0, 0); if (IS_ERR(rt)) { printk(KERN_ERR "%s: ip_route_output_key failed for 0x%08X\n", __func__, dst_ip); @@ -1533,7 +1534,7 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpi else netdev = nesvnic->netdev; - neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, netdev); + neigh = dst_neigh_lookup(&rt->dst, &dst_ipaddr); rcu_read_lock(); if (neigh) { From 0a691272509d59ea0adbaa0c420ca0a71c9d0419 Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Thu, 2 Jul 2015 12:52:29 -0500 Subject: [PATCH 11/24] RDMA/nes: Fix for incorrect recording of the MAC address Fix for incorrect recording of the MAC address Signed-off-by: Tatyana Nikolova Signed-off-by: Doug Ledford --- drivers/infiniband/hw/nes/nes_hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 02120d340d50..4713dd7ed764 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -3861,7 +3861,7 @@ void nes_manage_arp_cache(struct net_device *netdev, unsigned char *mac_addr, (((u32)mac_addr[2]) << 24) | (((u32)mac_addr[3]) << 16) | (((u32)mac_addr[4]) << 8) | (u32)mac_addr[5]); cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = cpu_to_le32( - (((u32)mac_addr[0]) << 16) | (u32)mac_addr[1]); + (((u32)mac_addr[0]) << 8) | (u32)mac_addr[1]); } else { cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = 0; cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = 0; From 4fabb59449aa44a585b3603ffdadd4c5f4d0c033 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Mon, 6 Jul 2015 14:35:11 +0800 Subject: [PATCH 12/24] rds: rds_ib_device.refcount overflow Fixes: 3e0249f9c05c ("RDS/IB: add refcount tracking to struct rds_ib_device") There lacks a dropping on rds_ib_device.refcount in case rds_ib_alloc_fmr failed(mr pool running out). this lead to the refcount overflow. A complain in line 117(see following) is seen. From vmcore: s_ib_rdma_mr_pool_depleted is 2147485544 and rds_ibdev->refcount is -2147475448. That is the evidence the mr pool is used up. so rds_ib_alloc_fmr is very likely to return ERR_PTR(-EAGAIN). 115 void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) 116 { 117 BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0); 118 if (atomic_dec_and_test(&rds_ibdev->refcount)) 119 queue_work(rds_wq, &rds_ibdev->free_work); 120 } fix is to drop refcount when rds_ib_alloc_fmr failed. Signed-off-by: Wengang Wang Reviewed-by: Haggai Eran Signed-off-by: Doug Ledford --- net/rds/ib_rdma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 273b8bff6ba4..657ba9f5d308 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -759,8 +759,10 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, } ibmr = rds_ib_alloc_fmr(rds_ibdev); - if (IS_ERR(ibmr)) + if (IS_ERR(ibmr)) { + rds_ib_dev_put(rds_ibdev); return ibmr; + } ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); if (ret == 0) From 31b57b87fddf547bf3e98306b41bc82e111396fa Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Tue, 7 Jul 2015 17:45:12 +0300 Subject: [PATCH 13/24] IB/ucma: Fix lockdep warning in ucma_lock_files The ucma_lock_files() locks the mut mutex on two files, e.g. for migrating an ID. Use mutex_lock_nested() to prevent the warning below. ============================================= [ INFO: possible recursive locking detected ] 4.1.0-rc6-hmm+ #40 Tainted: G O --------------------------------------------- pingpong_rpc_se/10260 is trying to acquire lock: (&file->mut){+.+.+.}, at: [] ucma_migrate_id+0xc5/0x248 [rdma_ucm] but task is already holding lock: (&file->mut){+.+.+.}, at: [] ucma_migrate_id+0xbb/0x248 [rdma_ucm] other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&file->mut); lock(&file->mut); *** DEADLOCK *** May be due to missing lock nesting notation 1 lock held by pingpong_rpc_se/10260: #0: (&file->mut){+.+.+.}, at: [] ucma_migrate_id+0xbb/0x248 [rdma_ucm] stack backtrace: CPU: 0 PID: 10260 Comm: pingpong_rpc_se Tainted: G O 4.1.0-rc6-hmm+ #40 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2007 ffff8801f85b63d0 ffff880195677b58 ffffffff81668f49 0000000000000001 ffffffff825cbbe0 ffff880195677c38 ffffffff810bb991 ffff880100000000 ffff880100000000 ffff880100000001 ffff8801f85b7010 ffffffff8121bee9 Call Trace: [] dump_stack+0x4f/0x6e [] __lock_acquire+0x741/0x1820 [] ? dput+0x29/0x320 [] lock_acquire+0xc8/0x240 [] ? ucma_migrate_id+0xc5/0x248 [rdma_ucm] [] ? mutex_lock_nested+0x291/0x3e0 [] mutex_lock_nested+0x65/0x3e0 [] ? ucma_migrate_id+0xc5/0x248 [rdma_ucm] [] ? trace_hardirqs_on+0xd/0x10 [] ? mutex_unlock+0xe/0x10 [] ucma_migrate_id+0xc5/0x248 [rdma_ucm] [] ucma_write+0xa4/0xb0 [rdma_ucm] [] __vfs_write+0x34/0x100 [] ? __audit_syscall_entry+0xac/0x110 [] ? current_kernel_time+0xc5/0xe0 [] ? security_file_permission+0x23/0x90 [] ? rw_verify_area+0x5d/0xe0 [] vfs_write+0xab/0x120 [] SyS_write+0x59/0xd0 [] ? __audit_syscall_entry+0xac/0x110 [] system_call_fastpath+0x12/0x76 Signed-off-by: Haggai Eran Signed-off-by: Doug Ledford --- drivers/infiniband/core/ucma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index ad45469f7582..a700cdb13f3d 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1354,10 +1354,10 @@ static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2) /* Acquire mutex's based on pointer comparison to prevent deadlock. */ if (file1 < file2) { mutex_lock(&file1->mut); - mutex_lock(&file2->mut); + mutex_lock_nested(&file2->mut, SINGLE_DEPTH_NESTING); } else { mutex_lock(&file2->mut); - mutex_lock(&file1->mut); + mutex_lock_nested(&file1->mut, SINGLE_DEPTH_NESTING); } } From 8b7cce0dae956b329d09099f8739821936cf7c0f Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Tue, 7 Jul 2015 17:45:13 +0300 Subject: [PATCH 14/24] IB/ipoib: Prevent lockdep warning in __ipoib_ib_dev_flush __ipoib_ib_dev_flush calls itself recursively on child devices, and lockdep complains about locking vlan_rwsem twice (see below). Use down_read_nested instead of down_read to prevent the warning. ============================================= [ INFO: possible recursive locking detected ] 4.1.0-rc4+ #36 Tainted: G O --------------------------------------------- kworker/u20:2/261 is trying to acquire lock: (&priv->vlan_rwsem){.+.+..}, at: [] __ipoib_ib_dev_flush+0x3a/0x2b0 [ib_ipoib] but task is already holding lock: (&priv->vlan_rwsem){.+.+..}, at: [] __ipoib_ib_dev_flush+0x3a/0x2b0 [ib_ipoib] other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&priv->vlan_rwsem); lock(&priv->vlan_rwsem); *** DEADLOCK *** May be due to missing lock nesting notation 3 locks held by kworker/u20:2/261: #0: ("%s""ipoib_flush"){.+.+..}, at: [] process_one_work+0x15c/0x760 #1: ((&priv->flush_heavy)){+.+...}, at: [] process_one_work+0x15c/0x760 #2: (&priv->vlan_rwsem){.+.+..}, at: [] __ipoib_ib_dev_flush+0x3a/0x2b0 [ib_ipoib] stack backtrace: CPU: 3 PID: 261 Comm: kworker/u20:2 Tainted: G O 4.1.0-rc4+ #36 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2007 Workqueue: ipoib_flush ipoib_ib_dev_flush_heavy [ib_ipoib] ffff8801c6c54790 ffff8801c9927af8 ffffffff81665238 0000000000000001 ffffffff825b5b30 ffff8801c9927bd8 ffffffff810bba51 ffff880100000000 ffffffff00000001 ffff880100000001 ffff8801c6c55428 ffff8801c6c54790 Call Trace: [] dump_stack+0x4f/0x6f [] __lock_acquire+0x741/0x1820 [] lock_acquire+0xc8/0x240 [] ? __ipoib_ib_dev_flush+0x3a/0x2b0 [ib_ipoib] [] down_read+0x4c/0x70 [] ? __ipoib_ib_dev_flush+0x3a/0x2b0 [ib_ipoib] [] __ipoib_ib_dev_flush+0x3a/0x2b0 [ib_ipoib] [] __ipoib_ib_dev_flush+0x5a/0x2b0 [ib_ipoib] [] ipoib_ib_dev_flush_heavy+0x1a/0x20 [ib_ipoib] [] process_one_work+0x201/0x760 [] ? process_one_work+0x15c/0x760 [] worker_thread+0x120/0x4d0 [] ? process_one_work+0x760/0x760 [] ? process_one_work+0x760/0x760 [] kthread+0xfe/0x120 [] ? __init_kthread_worker+0x70/0x70 [] ret_from_fork+0x42/0x70 [] ? __init_kthread_worker+0x70/0x70 Signed-off-by: Haggai Eran Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 63b92cbb29ad..058150ff5aa1 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -985,20 +985,21 @@ static inline int update_child_pkey(struct ipoib_dev_priv *priv) } static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, - enum ipoib_flush_level level) + enum ipoib_flush_level level, + int nesting) { struct ipoib_dev_priv *cpriv; struct net_device *dev = priv->dev; int result; - down_read(&priv->vlan_rwsem); + down_read_nested(&priv->vlan_rwsem, nesting); /* * Flush any child interfaces too -- they might be up even if * the parent is down. */ list_for_each_entry(cpriv, &priv->child_intfs, list) - __ipoib_ib_dev_flush(cpriv, level); + __ipoib_ib_dev_flush(cpriv, level, nesting + 1); up_read(&priv->vlan_rwsem); @@ -1076,7 +1077,7 @@ void ipoib_ib_dev_flush_light(struct work_struct *work) struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, flush_light); - __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT); + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT, 0); } void ipoib_ib_dev_flush_normal(struct work_struct *work) @@ -1084,7 +1085,7 @@ void ipoib_ib_dev_flush_normal(struct work_struct *work) struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, flush_normal); - __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL); + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL, 0); } void ipoib_ib_dev_flush_heavy(struct work_struct *work) @@ -1092,7 +1093,7 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work) struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, flush_heavy); - __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY); + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY, 0); } void ipoib_ib_dev_cleanup(struct net_device *dev) From 59d40dd92ca00df317ad44fa44c27fe38c58e477 Mon Sep 17 00:00:00 2001 From: Carol L Soto Date: Thu, 11 Jun 2015 11:06:41 -0500 Subject: [PATCH 15/24] IB/ucm: Fix bitmap wrap when devnum > IB_UCM_MAX_DEVICES ib_ucm_release_dev clears the wrong bit if devnum is greater than IB_UCM_MAX_DEVICES. Signed-off-by: Carol L Soto Signed-off-by: Doug Ledford --- drivers/infiniband/core/ucm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 62c24b1452b8..009481073644 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -1193,6 +1193,7 @@ static int ib_ucm_close(struct inode *inode, struct file *filp) return 0; } +static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES); static void ib_ucm_release_dev(struct device *dev) { struct ib_ucm_device *ucm_dev; @@ -1202,7 +1203,7 @@ static void ib_ucm_release_dev(struct device *dev) if (ucm_dev->devnum < IB_UCM_MAX_DEVICES) clear_bit(ucm_dev->devnum, dev_map); else - clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, dev_map); + clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, overflow_map); kfree(ucm_dev); } @@ -1226,7 +1227,6 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); static dev_t overflow_maj; -static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES); static int find_overflow_devnum(void) { int ret; From c42687784b9a6b9733fee701ed236a5fe088fac4 Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Sun, 12 Jul 2015 01:24:09 -0700 Subject: [PATCH 16/24] IB/ipoib: Scatter-Gather support in connected mode By default, IPoIB-CM driver uses 64k MTU. Larger MTU gives better performance. This MTU plus overhead puts the memory allocation for IP based packets at 32 4k pages (order 5), which have to be contiguous. When the system memory under pressure, it was observed that allocating 128k contiguous physical memory is difficult and causes serious errors (such as system becomes unusable). This enhancement resolve the issue by removing the physically contiguous memory requirement using Scatter/Gather feature that exists in Linux stack. With this fix Scatter-Gather will be supported also in connected mode. This change reverts some of the change made in commit e112373fd6aa ("IPoIB/cm: Reduce connected mode TX object size"). The ability to use SG in IPoIB CM is possible because the coupling between NETIF_F_SG and NETIF_F_CSUM was removed in commit ec5f06156423 ("net: Kill link between CSUM and SG features.") Signed-off-by: Yuval Shaia Acked-by: Christian Marie Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib.h | 29 +++++++++++++++++- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 33 +++++++++------------ drivers/infiniband/ulp/ipoib/ipoib_ib.c | 36 +++++++---------------- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 +- 4 files changed, 54 insertions(+), 46 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index bd94b0a6e9e5..79859c4d43c9 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -239,7 +239,7 @@ struct ipoib_cm_tx { struct net_device *dev; struct ipoib_neigh *neigh; struct ipoib_path *path; - struct ipoib_cm_tx_buf *tx_ring; + struct ipoib_tx_buf *tx_ring; unsigned tx_head; unsigned tx_tail; unsigned long flags; @@ -504,6 +504,33 @@ int ipoib_mcast_stop_thread(struct net_device *dev); void ipoib_mcast_dev_down(struct net_device *dev); void ipoib_mcast_dev_flush(struct net_device *dev); +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req); +void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv, + struct ipoib_tx_buf *tx_req); + +static inline void ipoib_build_sge(struct ipoib_dev_priv *priv, + struct ipoib_tx_buf *tx_req) +{ + int i, off; + struct sk_buff *skb = tx_req->skb; + skb_frag_t *frags = skb_shinfo(skb)->frags; + int nr_frags = skb_shinfo(skb)->nr_frags; + u64 *mapping = tx_req->mapping; + + if (skb_headlen(skb)) { + priv->tx_sge[0].addr = mapping[0]; + priv->tx_sge[0].length = skb_headlen(skb); + off = 1; + } else + off = 0; + + for (i = 0; i < nr_frags; ++i) { + priv->tx_sge[i + off].addr = mapping[i + off]; + priv->tx_sge[i + off].length = skb_frag_size(&frags[i]); + } + priv->tx_wr.num_sge = nr_frags + off; +} + #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev); int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index cf32a778e7d0..ee39be6ccfb0 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -694,14 +694,12 @@ repost: static inline int post_send(struct ipoib_dev_priv *priv, struct ipoib_cm_tx *tx, unsigned int wr_id, - u64 addr, int len) + struct ipoib_tx_buf *tx_req) { struct ib_send_wr *bad_wr; - priv->tx_sge[0].addr = addr; - priv->tx_sge[0].length = len; + ipoib_build_sge(priv, tx_req); - priv->tx_wr.num_sge = 1; priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM; return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr); @@ -710,8 +708,7 @@ static inline int post_send(struct ipoib_dev_priv *priv, void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) { struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ipoib_cm_tx_buf *tx_req; - u64 addr; + struct ipoib_tx_buf *tx_req; int rc; if (unlikely(skb->len > tx->mtu)) { @@ -735,24 +732,21 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ */ tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)]; tx_req->skb = skb; - addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE); - if (unlikely(ib_dma_mapping_error(priv->ca, addr))) { + + if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) { ++dev->stats.tx_errors; dev_kfree_skb_any(skb); return; } - tx_req->mapping = addr; - skb_orphan(skb); skb_dst_drop(skb); - rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), - addr, skb->len); + rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), tx_req); if (unlikely(rc)) { ipoib_warn(priv, "post_send failed, error %d\n", rc); ++dev->stats.tx_errors; - ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE); + ipoib_dma_unmap_tx(priv, tx_req); dev_kfree_skb_any(skb); } else { dev->trans_start = jiffies; @@ -777,7 +771,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_cm_tx *tx = wc->qp->qp_context; unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM; - struct ipoib_cm_tx_buf *tx_req; + struct ipoib_tx_buf *tx_req; unsigned long flags; ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n", @@ -791,7 +785,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) tx_req = &tx->tx_ring[wr_id]; - ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE); + ipoib_dma_unmap_tx(priv, tx_req); /* FIXME: is this right? Shouldn't we only increment on success? */ ++dev->stats.tx_packets; @@ -1036,6 +1030,9 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_ struct ib_qp *tx_qp; + if (dev->features & NETIF_F_SG) + attr.cap.max_send_sge = MAX_SKB_FRAGS + 1; + tx_qp = ib_create_qp(priv->pd, &attr); if (PTR_ERR(tx_qp) == -EINVAL) { ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n", @@ -1170,7 +1167,7 @@ err_tx: static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) { struct ipoib_dev_priv *priv = netdev_priv(p->dev); - struct ipoib_cm_tx_buf *tx_req; + struct ipoib_tx_buf *tx_req; unsigned long begin; ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n", @@ -1197,8 +1194,7 @@ timeout: while ((int) p->tx_tail - (int) p->tx_head < 0) { tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; - ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, - DMA_TO_DEVICE); + ipoib_dma_unmap_tx(priv, tx_req); dev_kfree_skb_any(tx_req->skb); ++p->tx_tail; netif_tx_lock_bh(p->dev); @@ -1455,7 +1451,6 @@ static void ipoib_cm_stale_task(struct work_struct *work) spin_unlock_irq(&priv->lock); } - static ssize_t show_mode(struct device *d, struct device_attribute *attr, char *buf) { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 058150ff5aa1..d266667ca9b8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -263,8 +263,7 @@ repost: "for buf %d\n", wr_id); } -static int ipoib_dma_map_tx(struct ib_device *ca, - struct ipoib_tx_buf *tx_req) +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req) { struct sk_buff *skb = tx_req->skb; u64 *mapping = tx_req->mapping; @@ -305,8 +304,8 @@ partial_error: return -EIO; } -static void ipoib_dma_unmap_tx(struct ib_device *ca, - struct ipoib_tx_buf *tx_req) +void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv, + struct ipoib_tx_buf *tx_req) { struct sk_buff *skb = tx_req->skb; u64 *mapping = tx_req->mapping; @@ -314,7 +313,8 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca, int off; if (skb_headlen(skb)) { - ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE); + ib_dma_unmap_single(priv->ca, mapping[0], skb_headlen(skb), + DMA_TO_DEVICE); off = 1; } else off = 0; @@ -322,8 +322,8 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca, for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - ib_dma_unmap_page(ca, mapping[i + off], skb_frag_size(frag), - DMA_TO_DEVICE); + ib_dma_unmap_page(priv->ca, mapping[i + off], + skb_frag_size(frag), DMA_TO_DEVICE); } } @@ -389,7 +389,7 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) tx_req = &priv->tx_ring[wr_id]; - ipoib_dma_unmap_tx(priv->ca, tx_req); + ipoib_dma_unmap_tx(priv, tx_req); ++dev->stats.tx_packets; dev->stats.tx_bytes += tx_req->skb->len; @@ -514,24 +514,10 @@ static inline int post_send(struct ipoib_dev_priv *priv, void *head, int hlen) { struct ib_send_wr *bad_wr; - int i, off; struct sk_buff *skb = tx_req->skb; - skb_frag_t *frags = skb_shinfo(skb)->frags; - int nr_frags = skb_shinfo(skb)->nr_frags; - u64 *mapping = tx_req->mapping; - if (skb_headlen(skb)) { - priv->tx_sge[0].addr = mapping[0]; - priv->tx_sge[0].length = skb_headlen(skb); - off = 1; - } else - off = 0; + ipoib_build_sge(priv, tx_req); - for (i = 0; i < nr_frags; ++i) { - priv->tx_sge[i + off].addr = mapping[i + off]; - priv->tx_sge[i + off].length = skb_frag_size(&frags[i]); - } - priv->tx_wr.num_sge = nr_frags + off; priv->tx_wr.wr_id = wr_id; priv->tx_wr.wr.ud.remote_qpn = qpn; priv->tx_wr.wr.ud.ah = address; @@ -617,7 +603,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, ipoib_warn(priv, "post_send failed, error %d\n", rc); ++dev->stats.tx_errors; --priv->tx_outstanding; - ipoib_dma_unmap_tx(priv->ca, tx_req); + ipoib_dma_unmap_tx(priv, tx_req); dev_kfree_skb_any(skb); if (netif_queue_stopped(dev)) netif_wake_queue(dev); @@ -868,7 +854,7 @@ int ipoib_ib_dev_stop(struct net_device *dev) while ((int) priv->tx_tail - (int) priv->tx_head < 0) { tx_req = &priv->tx_ring[priv->tx_tail & (ipoib_sendq_size - 1)]; - ipoib_dma_unmap_tx(priv->ca, tx_req); + ipoib_dma_unmap_tx(priv, tx_req); dev_kfree_skb_any(tx_req->skb); ++priv->tx_tail; --priv->tx_outstanding; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index da3e7e7d5e80..59604a7e5e3e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -190,7 +190,7 @@ static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_featu struct ipoib_dev_priv *priv = netdev_priv(dev); if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) - features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO); + features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO); return features; } From edcd2a7474ba3b47e54c3c9a300287342de74766 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Sun, 7 Jun 2015 13:36:11 +0300 Subject: [PATCH 17/24] IB/ipoib: Set MTU to max allowed by mode when mode changes When switching between modes (datagram / connected) change the MTU accordingly. datagram mode up to 4K, connected mode up to (64K - 0x10). Signed-off-by: ELi Cohen Signed-off-by: Erez Shitrit Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 59604a7e5e3e..b2943c84a5dd 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -232,6 +232,7 @@ int ipoib_set_mode(struct net_device *dev, const char *buf) ipoib_warn(priv, "enabling connected mode " "will cause multicast packet drops\n"); netdev_update_features(dev); + dev_set_mtu(dev, ipoib_cm_max_mtu(dev)); rtnl_unlock(); priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; From cb1ff431c3dd904cda37d6d07d4a3ea29840d621 Mon Sep 17 00:00:00 2001 From: Vaishali Thakkar Date: Tue, 16 Jun 2015 17:13:05 +0530 Subject: [PATCH 18/24] IB/ipath: Convert use of __constant_ to In little endian cases, the macros be16_to_cpu and cpu_to_be64 unfolds to __swab{16,64} which provides special case for constants. In big endian cases, __constant_be16_to_cpu and be16_to_cpu expand directly to the same expression. The same applies for __constant_cpu_to_be64 and cpu_to_be64. So, replace __constant_be16_to_cpu with be16_to_cpu and __constant_cpu_to_be64 with cpu_to_be64, with the goal of getting rid of the definition of __constant_be16_to_cpu and __constant_cpu_to_be64 completely. Signed-off-by: Vaishali Thakkar Signed-off-by: Doug Ledford --- drivers/infiniband/hw/ipath/ipath_verbs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index 48253b839a6f..30ba49c4a98c 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -2044,9 +2044,9 @@ int ipath_register_ib_device(struct ipath_devdata *dd) spin_lock_init(&idev->qp_table.lock); spin_lock_init(&idev->lk_table.lock); - idev->sm_lid = __constant_be16_to_cpu(IB_LID_PERMISSIVE); + idev->sm_lid = be16_to_cpu(IB_LID_PERMISSIVE); /* Set the prefix to the default value (see ch. 4.1.1) */ - idev->gid_prefix = __constant_cpu_to_be64(0xfe80000000000000ULL); + idev->gid_prefix = cpu_to_be64(0xfe80000000000000ULL); ret = ipath_init_qp_table(idev, ib_ipath_qp_table_size); if (ret) From 43bfb9729ea88d46e3f4d3ad7b17106c7b071fcb Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 25 Jun 2015 17:45:38 +0300 Subject: [PATCH 19/24] IB/mlx4: Fix use of flow-counters for process_mad For IB links, reading HCA flow counters through iboe_process_mad() should be used when mlx4_ib_process_mad() is invoked only for VFs PMA queries and exactly nothing else. Fixes: 7193a141eb74 ('IB/mlx4: Set VF to read from QP counters') Reported-by: Linus Torvalds Signed-off-by: Or Gerlitz Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/mad.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 9a810e302f6b..68b3dfa922bf 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -860,22 +860,31 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, struct mlx4_ib_dev *dev = to_mdev(ibdev); const struct ib_mad *in_mad = (const struct ib_mad *)in; struct ib_mad *out_mad = (struct ib_mad *)out; + enum rdma_link_layer link = rdma_port_get_link_layer(ibdev, port_num); if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || *out_mad_size != sizeof(*out_mad))) return IB_MAD_RESULT_FAILURE; - switch (rdma_port_get_link_layer(ibdev, port_num)) { - case IB_LINK_LAYER_INFINIBAND: - if (!mlx4_is_slave(dev->dev)) - return ib_process_mad(ibdev, mad_flags, port_num, in_wc, - in_grh, in_mad, out_mad); - case IB_LINK_LAYER_ETHERNET: - return iboe_process_mad(ibdev, mad_flags, port_num, in_wc, - in_grh, in_mad, out_mad); - default: - return -EINVAL; + /* iboe_process_mad() which uses the HCA flow-counters to implement IB PMA + * queries, should be called only by VFs and for that specific purpose + */ + if (link == IB_LINK_LAYER_INFINIBAND) { + if (mlx4_is_slave(dev->dev) && + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT && + in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS) + return iboe_process_mad(ibdev, mad_flags, port_num, in_wc, + in_grh, in_mad, out_mad); + + return ib_process_mad(ibdev, mad_flags, port_num, in_wc, + in_grh, in_mad, out_mad); } + + if (link == IB_LINK_LAYER_ETHERNET) + return iboe_process_mad(ibdev, mad_flags, port_num, in_wc, + in_grh, in_mad, out_mad); + + return -EINVAL; } static void send_handler(struct ib_mad_agent *agent, From a39a98ff4cc8b514fe6fa551f6ed59cd60e07da2 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Wed, 8 Jul 2015 09:43:35 +0530 Subject: [PATCH 20/24] IB/mlx4: Optimize freeing of items on error unwind On failure, we loop through all possible pointers and test them before calling kfree. But really, why even attempt to free items we didn't allocate when we can easily loop through exactly and only the devices for which the original memory allocation succeeded and free just those. Signed-off-by: Maninder Singh Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index f419a728a22b..064454aee863 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2670,17 +2670,15 @@ static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) dm = kcalloc(ports, sizeof(*dm), GFP_ATOMIC); if (!dm) { pr_err("failed to allocate memory for tunneling qp update\n"); - goto out; + return; } for (i = 0; i < ports; i++) { dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC); if (!dm[i]) { pr_err("failed to allocate memory for tunneling qp update work struct\n"); - for (i = 0; i < dev->caps.num_ports; i++) { - if (dm[i]) - kfree(dm[i]); - } + while (--i >= 0) + kfree(dm[i]); goto out; } } From 9bbf282da87294e1bda0ccb4e351bfdf5fc076cd Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Thu, 9 Jul 2015 10:16:12 -0400 Subject: [PATCH 21/24] IB/mlx4: Fix memory leak in do_slave_init We create a number of work structs to be queued up to a workqueue, and on completion of the workqueue handler, the workqueue handler frees the allocated memory. If, however, we don't queue the work struct because the device is going down, then we need to free the memory ourselves. Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 064454aee863..1c59e4749736 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2692,6 +2692,8 @@ static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags); if (!ibdev->sriov.is_going_down) queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work); + else + kfree(dm[i]); spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); } out: From d9a047aeffcef5755952d18f2901d8777d84019d Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Thu, 9 Jul 2015 10:21:08 -0400 Subject: [PATCH 22/24] IB/mlx4: Optimize do_slave_init There is little chance our memory allocation will fail, so we can combine initializing the work structs with allocating them instead of looping through all of them once to allocate and again to initialize. Then when we need to actually find out if our device is up or in the process of going down, have all of our work structs batched up, take the spin_lock once and only once, and do all of the batch under the one spin_lock invocation instead of incurring all of the locked memory cycles we would otherwise incur to take/release the spin_lock over and over again. Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 1c59e4749736..8be6db816460 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2681,20 +2681,22 @@ static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) kfree(dm[i]); goto out; } - } - /* initialize or tear down tunnel QPs for the slave */ - for (i = 0; i < ports; i++) { INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work); dm[i]->port = first_port + i + 1; dm[i]->slave = slave; dm[i]->do_init = do_init; dm[i]->dev = ibdev; - spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags); - if (!ibdev->sriov.is_going_down) + } + /* initialize or tear down tunnel QPs for the slave */ + spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags); + if (!ibdev->sriov.is_going_down) { + for (i = 0; i < ports; i++) queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work); - else - kfree(dm[i]); spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); + } else { + spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); + for (i = 0; i < ports; i++) + kfree(dm[i]); } out: kfree(dm); From 45d254206f141f560e76319191f912d0a3d27bd3 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 8 Jul 2015 17:21:15 +0200 Subject: [PATCH 23/24] IB/core: Destroy multcast_idr on module exit Destroy multcast_idr on module exit, reclaiming the allocated memory. This was detected by the following semantic patch (written by Luis Rodriguez ) @ defines_module_init @ declarer name module_init, module_exit; declarer name DEFINE_IDR; identifier init; @@ module_init(init); @ defines_module_exit @ identifier exit; @@ module_exit(exit); @ declares_idr depends on defines_module_init && defines_module_exit @ identifier idr; @@ DEFINE_IDR(idr); @ on_exit_calls_destroy depends on declares_idr && defines_module_exit @ identifier declares_idr.idr, defines_module_exit.exit; @@ exit(void) { ... idr_destroy(&idr); ... } @ missing_module_idr_destroy depends on declares_idr && defines_module_exit && !on_exit_calls_destroy @ identifier declares_idr.idr, defines_module_exit.exit; @@ exit(void) { ... +idr_destroy(&idr); } Signed-off-by: Johannes Thumshirn Signed-off-by: Doug Ledford --- drivers/infiniband/core/ucma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index a700cdb13f3d..29b21213ea75 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1616,6 +1616,7 @@ static void __exit ucma_cleanup(void) device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); misc_deregister(&ucma_misc); idr_destroy(&ctx_idr); + idr_destroy(&multicast_idr); } module_init(ucma_init); From d8b2ba7c5928173fe1c12bd2545f5ed85d1c3c7a Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 8 Jul 2015 17:23:00 +0200 Subject: [PATCH 24/24] IB/core: Destroy ocrdma_dev_id IDR on module exit Destroy ocrdma_dev_id IDR on module exit, reclaiming the allocated memory. This was detected by the following semantic patch (written by Luis Rodriguez ) @ defines_module_init @ declarer name module_init, module_exit; declarer name DEFINE_IDR; identifier init; @@ module_init(init); @ defines_module_exit @ identifier exit; @@ module_exit(exit); @ declares_idr depends on defines_module_init && defines_module_exit @ identifier idr; @@ DEFINE_IDR(idr); @ on_exit_calls_destroy depends on declares_idr && defines_module_exit @ identifier declares_idr.idr, defines_module_exit.exit; @@ exit(void) { ... idr_destroy(&idr); ... } @ missing_module_idr_destroy depends on declares_idr && defines_module_exit && !on_exit_calls_destroy @ identifier declares_idr.idr, defines_module_exit.exit; @@ exit(void) { ... +idr_destroy(&idr); } Signed-off-by: Johannes Thumshirn Signed-off-by: Doug Ledford --- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 8a1398b253a2..d98a707a5eb9 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -696,6 +696,7 @@ static void __exit ocrdma_exit_module(void) ocrdma_unregister_inet6addr_notifier(); ocrdma_unregister_inetaddr_notifier(); ocrdma_rem_debugfs(); + idr_destroy(&ocrdma_dev_id); } module_init(ocrdma_init_module);