From edf3f301db7af7e784d06f7059dfc8a69359af13 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Mon, 10 Jul 2017 18:45:41 +0300 Subject: [PATCH 01/16] IB/ipoib: Fix race between light events and interface restart A potential race between light_event and interface restart may attach multicast group to an already attached QP. Scenario: light_event flow goes through ipoib_mcast_dev_flush function, if a context switch occurs before calling ipoib_mcast_remove_list, then we may face a situation where the broadcast of the priv is null and the corresponding QP is not detached yet. If an "interface restart" runs during the previous context switch, the following scenario occurs: When the device goes up, ipoib_ib_dev_up function will be called, it will send a new registration request to the broadcast group and then attach the group to the QP that was not detached before. IPOIB_FLUSH_LIGHT INTERFACE RESTART __ipoib_ib_dev_flush | | | | | | | ipoib_mcast_dev_flush | Move mcast list and broadcast to remove_list | | | | | Context Switch--> | | ipoib_ib_dev_down | | | | | ipoib_ib_dev_up | | | | | ipoib_mcast_join_task | allocate new broadcast | | | | | Attach QP to multicast group | | | | | <--Context Switch ipoib_mcast_leave Detach QP from multicast group Signed-off-by: Feras Daoud Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib.h | 1 + drivers/infiniband/ulp/ipoib/ipoib_main.c | 1 + drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 2 ++ 3 files changed, 4 insertions(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index ff50a7bd66d8..7ac25059c40f 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -336,6 +336,7 @@ struct ipoib_dev_priv { unsigned long flags; struct rw_semaphore vlan_rwsem; + struct mutex mcast_mutex; struct rb_root path_tree; struct list_head path_list; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 4ce315c92b48..144187b407bd 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1877,6 +1877,7 @@ static void ipoib_build_priv(struct net_device *dev) priv->dev = dev; spin_lock_init(&priv->lock); init_rwsem(&priv->vlan_rwsem); + mutex_init(&priv->mcast_mutex); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 057f58e6afca..0a0b2ce45cbc 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -838,6 +838,7 @@ void ipoib_mcast_dev_flush(struct net_device *dev) struct ipoib_mcast *mcast, *tmcast; unsigned long flags; + mutex_lock(&priv->mcast_mutex); ipoib_dbg_mcast(priv, "flushing multicast list\n"); spin_lock_irqsave(&priv->lock, flags); @@ -865,6 +866,7 @@ void ipoib_mcast_dev_flush(struct net_device *dev) wait_for_completion(&mcast->done); ipoib_mcast_remove_list(&remove_list); + mutex_unlock(&priv->mcast_mutex); } static int ipoib_mcast_addr_is_valid(const u8 *addr, const u8 *broadcast) From 6bdc8de2e86e717124a715ecc480892a2c331ff5 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Wed, 12 Jul 2017 10:40:25 +0300 Subject: [PATCH 02/16] IB/ipoib: Use cancel_delayed_work_sync when needed The work mcast_task can re-queue itself, so instead of doing cancel && flush_workqueue, that still can leave a queued task on the air, use cancel_delayed_work_sync. Also, no need to use lock over the cancel, the original lock was due to bit assignment setting (IPOIB_MCAST_RUN) that is not in use anymore. Signed-off-by: Erez Shitrit Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 0a0b2ce45cbc..f80bf0f5d7cf 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -684,15 +684,10 @@ void ipoib_mcast_start_thread(struct net_device *dev) int ipoib_mcast_stop_thread(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); - unsigned long flags; ipoib_dbg_mcast(priv, "stopping multicast thread\n"); - spin_lock_irqsave(&priv->lock, flags); - cancel_delayed_work(&priv->mcast_task); - spin_unlock_irqrestore(&priv->lock, flags); - - flush_workqueue(priv->wq); + cancel_delayed_work_sync(&priv->mcast_task); return 0; } From a08e1120627f72e9ed7c291e3b9f8dd29c1513ab Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Wed, 12 Jul 2017 13:11:54 +0300 Subject: [PATCH 03/16] IB/ipoib: Make sure no in-flight joins while leaving that mcast While cleaning neighs and there is a send-only mcast neigh, the driver should wait to finish its join process before trying to remove it. Without this patch, we will see messages like: "ipoib_mcast_leave on an in-flight join" and unexpected results in the join_complete. Signed-off-by: Erez Shitrit Signed-off-by: Leon Romanovsky --- .../infiniband/ulp/ipoib/ipoib_multicast.c | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index f80bf0f5d7cf..93e149efc1f5 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -743,6 +743,14 @@ void ipoib_mcast_remove_list(struct list_head *remove_list) { struct ipoib_mcast *mcast, *tmcast; + /* + * make sure the in-flight joins have finished before we attempt + * to leave + */ + list_for_each_entry_safe(mcast, tmcast, remove_list, list) + if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + wait_for_completion(&mcast->done); + list_for_each_entry_safe(mcast, tmcast, remove_list, list) { ipoib_mcast_leave(mcast->dev, mcast); ipoib_mcast_free(mcast); @@ -852,14 +860,6 @@ void ipoib_mcast_dev_flush(struct net_device *dev) spin_unlock_irqrestore(&priv->lock, flags); - /* - * make sure the in-flight joins have finished before we attempt - * to leave - */ - list_for_each_entry_safe(mcast, tmcast, &remove_list, list) - if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) - wait_for_completion(&mcast->done); - ipoib_mcast_remove_list(&remove_list); mutex_unlock(&priv->mcast_mutex); } @@ -979,14 +979,6 @@ void ipoib_mcast_restart_task(struct work_struct *work) netif_addr_unlock(dev); local_irq_restore(flags); - /* - * make sure the in-flight joins have finished before we attempt - * to leave - */ - list_for_each_entry_safe(mcast, tmcast, &remove_list, list) - if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) - wait_for_completion(&mcast->done); - ipoib_mcast_remove_list(&remove_list); /* From 11f74b40359b19f760964e71d04882a6caf530cc Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 13 Jul 2017 11:27:12 +0300 Subject: [PATCH 04/16] IB/ipoib: Prevent setting negative values to max_nonsrq_conn_qp Don't allow negative values to max_nonsrq_conn_qp. There is no functional impact on a negative value but it is logicically incorrect. Fixes: 68e995a29572 ("IPoIB/cm: Add connected mode support for devices without SRQs") Signed-off-by: Alex Vesker Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 144187b407bd..8b7ec15a9d6e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -2366,6 +2366,7 @@ static int __init ipoib_init_module(void) ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE); #ifdef CONFIG_INFINIBAND_IPOIB_CM ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); + ipoib_max_conn_qp = max(ipoib_max_conn_qp, 0); #endif /* From d2e46fccc3e3d73a741efe433f00960331280696 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Sun, 16 Jul 2017 11:33:01 +0300 Subject: [PATCH 05/16] IB/ipoib: Set IPOIB_NEIGH_TBL_FLUSH after flushed completion initialization Set IPOIB_NEIGH_TBL_FLUSH bit after initializing the neighbor flushed completion, otherwise the garbage collector may signal a completion while it is not initialized yet. Fixes: b63b70d87741 ("IPoIB: Use a private hash table for path lookup in xmit path") Signed-off-by: Feras Daoud Signed-off-by: Alex Vesker Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 8b7ec15a9d6e..f4403c52cd67 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1560,6 +1560,7 @@ static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) int i, wait_flushed = 0; init_completion(&priv->ntbl.flushed); + set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); spin_lock_irqsave(&priv->lock, flags); @@ -1604,7 +1605,6 @@ static void ipoib_neigh_hash_uninit(struct net_device *dev) ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n"); init_completion(&priv->ntbl.deleted); - set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); /* Stop GC if called at init fail need to cancel work */ stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); From 4829d964dfb027558c732cfa0d13b716ab3f0838 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Mon, 10 Jul 2017 18:12:43 +0300 Subject: [PATCH 06/16] IB/ipoib: Add multicast packets statistics Update the multicast counter when multicast packets are received and provide this information through ethtool support. Signed-off-by: Alex Vesker Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 3 ++- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 7871379342f4..184a22f48027 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -52,7 +52,8 @@ static const struct ipoib_stats ipoib_gstrings_stats[] = { IPOIB_NETDEV_STAT(tx_bytes), IPOIB_NETDEV_STAT(tx_errors), IPOIB_NETDEV_STAT(rx_dropped), - IPOIB_NETDEV_STAT(tx_dropped) + IPOIB_NETDEV_STAT(tx_dropped), + IPOIB_NETDEV_STAT(multicast), }; #define IPOIB_GLOBAL_STATS_LEN ARRAY_SIZE(ipoib_gstrings_stats) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 57a9655e844d..02eda1f53a67 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -256,6 +256,8 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) ++dev->stats.rx_packets; dev->stats.rx_bytes += skb->len; + if (skb->pkt_type == PACKET_MULTICAST) + dev->stats.multicast++; skb->dev = dev; if ((dev->features & NETIF_F_RXCSUM) && From eb54714ddcb2462d4d4b8aa78d028b61e217a835 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Sun, 2 Jul 2017 15:05:59 +0300 Subject: [PATCH 07/16] IB/ipoib: Add get statistics support to SRIOV VF Add SRIOV VF support to get traffic statistics. Signed-off-by: Feras Daoud Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index f4403c52cd67..24fa87fe0952 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1847,6 +1847,7 @@ static const struct net_device_ops ipoib_netdev_ops_vf = { .ndo_tx_timeout = ipoib_timeout, .ndo_set_rx_mode = ipoib_set_mcast_list, .ndo_get_iflink = ipoib_get_iflink, + .ndo_get_stats64 = ipoib_get_stats, }; void ipoib_setup_common(struct net_device *dev) From dc892e17bbae670a3d7aa6ab8bd1033b15b24645 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 13 Jul 2017 13:34:19 +0300 Subject: [PATCH 08/16] IB/ipoib: Clean error paths in add port Refactor error paths in ipoib_add_port() function. The code flow ensures that the function terminates on every error flow and it makes redundant all "else" cases. The functions are called during the flow are returning "result < 0", in case of error, so there is no need to check it explicitly. Fixes: 58e9cc90cda7 ("IB/IPoIB: Fix bad error flow in ipoib_add_port()") Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 24fa87fe0952..6c77df34869d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -2175,14 +2175,14 @@ static struct net_device *ipoib_add_port(const char *format, priv->dev->dev_id = port - 1; result = ib_query_port(hca, port, &attr); - if (!result) - priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); - else { + if (result) { printk(KERN_WARNING "%s: ib_query_port %d failed\n", hca->name, port); goto device_init_failed; } + priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); + /* MTU will be reset when mcast join happens */ priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; @@ -2213,12 +2213,14 @@ static struct net_device *ipoib_add_port(const char *format, printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", hca->name, port, result); goto device_init_failed; - } else - memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + } + + memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, + sizeof(union ib_gid)); set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); result = ipoib_dev_init(priv->dev, hca, port); - if (result < 0) { + if (result) { printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n", hca->name, port, result); goto device_init_failed; From 1b355094b308f3377c8f574ce86135ee159c6285 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 15 Jul 2017 16:26:55 +0300 Subject: [PATCH 09/16] IB/ipoib: Remove double pointer assigning There is no need to assign "p" pointer twice. This patch fixes the following smatch warning: drivers/infiniband/ulp/ipoib/ipoib_cm.c:517 ipoib_cm_rx_handler() warn: missing break? reassigning 'p->id' Fixes: 839fcaba355a ("IPoIB: Connected mode experimental support") Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index f87d104837dc..d69410c2ed97 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -511,7 +511,6 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, case IB_CM_REQ_RECEIVED: return ipoib_cm_req_handler(cm_id, event); case IB_CM_DREQ_RECEIVED: - p = cm_id->context; ib_send_cm_drep(cm_id, NULL, 0); /* Fall through */ case IB_CM_REJ_RECEIVED: From b287b76e89503ef1d403cc5cc8bd74b035d25bfa Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 23 Jul 2017 10:46:14 +0300 Subject: [PATCH 10/16] Revert "IB/core: Allow QP state transition from reset to error" The commit ebc9ca43e1d5 ("IB/core: Allow QP state transition from reset to error") allowed transition from Reset to Error state for the QPs. This behavior doesn't follow the IBTA specification 1.3, which in 10.3.1 QUEUE PAIR AND EE CONTEXT STATES section. The quote from the spec: "An error can be forced from any state, except Reset, with the Modify QP/EE Verb." Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/verbs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index fb98ed67d5bc..7f8fe443df46 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -895,7 +895,6 @@ static const struct { } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { [IB_QPS_RESET] = { [IB_QPS_RESET] = { .valid = 1 }, - [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_INIT] = { .valid = 1, .req_param = { From 5dc78ad1904db597bdb4427f3ead437aae86f54c Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Thu, 13 Jul 2017 14:29:08 +0300 Subject: [PATCH 11/16] IB/ipoib: Notify on modify QP failure only when relevant Modify QP can fail and it can be acceptable, like when moving from RST to ERR state, all the rest are not acceptable and a message to the log should be printed. The current code prints on all failures and many messages like: "Failed to modify QP to ERROR state" appear, even when supported by the state machine of the QP object. Signed-off-by: Erez Shitrit Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 02eda1f53a67..2e075377242e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -711,6 +711,27 @@ static int recvs_pending(struct net_device *dev) return pending; } +static void check_qp_movement_and_print(struct ipoib_dev_priv *priv, + struct ib_qp *qp, + enum ib_qp_state new_state) +{ + struct ib_qp_attr qp_attr; + struct ib_qp_init_attr query_init_attr; + int ret; + + ret = ib_query_qp(qp, &qp_attr, IB_QP_STATE, &query_init_attr); + if (ret) { + ipoib_warn(priv, "%s: Failed to query QP\n", __func__); + return; + } + /* print according to the new-state and the previous state.*/ + if (new_state == IB_QPS_ERR && qp_attr.qp_state == IB_QPS_RESET) + ipoib_dbg(priv, "Failed modify QP, IB_QPS_RESET to IB_QPS_ERR, acceptable\n"); + else + ipoib_warn(priv, "Failed to modify QP to state: %d from state: %d\n", + new_state, qp_attr.qp_state); +} + int ipoib_ib_dev_stop_default(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); @@ -730,7 +751,7 @@ int ipoib_ib_dev_stop_default(struct net_device *dev) */ qp_attr.qp_state = IB_QPS_ERR; if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) - ipoib_warn(priv, "Failed to modify QP to ERROR state\n"); + check_qp_movement_and_print(priv, priv->qp, IB_QPS_ERR); /* Wait for all sends and receives to complete */ begin = jiffies; From 5fff41e1f89d93feef9833c49a415dc337af5a99 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 1 Aug 2017 09:41:34 +0300 Subject: [PATCH 12/16] IB/core: Fix race condition in resolving IP to MAC Currently while resolving IP address to MAC address single delayed work is used for resolving multiple such resolve requests. This singled work is essentially performs two tasks. (a) any retry needed to resolve and (b) it executes the callback function for all completed requests While work is executing callbacks, any new work scheduled on for this workqueue is lost because workqueue has completed looking at all pending requests and now looking at callbacks, but work is still under execution. Any further retry to look at pending requests in process_req() after executing callbacks would lead to similar race condition (may be reduce the probably further but doesn't eliminate it). Retrying to enqueue work that from queue_req() context is not something rest of the kernel modules have followed. Therefore fix in this patch utilizes kernel facility to enqueue multiple work items to a workqueue. This ensures that no such requests gets lost in synchronization. Request list is still maintained so that rdma_cancel_addr() can unlink the request and get the completion with error sooner. Neighbour update event handling continues to be handled in same way as before. Additionally process_req() work entry cancels any pending work for a request that gets completed while processing those requests. Originally ib_addr was ST workqueue, but it became MT work queue with patch of [1]. This patch again makes it similar to ST so that neighbour update events handler work item doesn't race with other work items. In one such below trace, (though on 4.5 based kernel) it can be seen that process_req() never executed the callback, which is likely for an event that was schedule by queue_req() when previous callback was getting executed by workqueue. [] schedule+0x3e/0x90 [] schedule_timeout+0x1b5/0x210 [] ? ip_route_output_flow+0x27/0x70 [] ? addr_resolve+0x149/0x1b0 [ib_addr] [] wait_for_completion+0x10f/0x170 [] ? try_to_wake_up+0x210/0x210 [] ? rdma_copy_addr+0xa0/0xa0 [ib_addr] [] rdma_addr_find_l2_eth_by_grh+0x1d0/0x278 [ib_addr] [] ? sub_alloc+0x77/0x1c0 [] ib_init_ah_from_wc+0x3a7/0x5a0 [ib_core] [] cm_req_handler+0xea/0x580 [ib_cm] [] ? __switch_to+0x212/0x5e0 [] cm_work_handler+0x6d/0x150 [ib_cm] [] process_one_work+0x151/0x4b0 [] worker_thread+0x120/0x480 [] ? __schedule+0x30b/0x890 [] ? process_one_work+0x4b0/0x4b0 [] ? process_one_work+0x4b0/0x4b0 [] kthread+0xce/0xf0 [] ? kthread_freezable_should_stop+0x70/0x70 [] ret_from_fork+0x42/0x70 [] ? kthread_freezable_should_stop+0x70/0x70 INFO: task kworker/u144:1:156520 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. kworker/u144:1 D ffff883ffe1d7600 0 156520 2 0x00000080 Workqueue: ib_addr process_req [ib_addr] ffff883f446fbbd8 0000000000000046 ffff881f95280000 ffff881ff24de200 ffff883f66120000 ffff883f446f8008 ffff881f95280000 ffff883f6f9208c4 ffff883f6f9208c8 00000000ffffffff ffff883f446fbbf8 ffffffff816b0dde [1] http://lkml.iu.edu/hypermail/linux/kernel/1608.1/05834.html Signed-off-by: Parav Pandit Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/addr.c | 62 ++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 01236cef7bfb..437522ca97b4 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -61,6 +61,7 @@ struct addr_req { void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context); unsigned long timeout; + struct delayed_work work; int status; u32 seq; }; @@ -295,7 +296,7 @@ int rdma_translate_ip(const struct sockaddr *addr, } EXPORT_SYMBOL(rdma_translate_ip); -static void set_timeout(unsigned long time) +static void set_timeout(struct delayed_work *delayed_work, unsigned long time) { unsigned long delay; @@ -303,7 +304,7 @@ static void set_timeout(unsigned long time) if ((long)delay < 0) delay = 0; - mod_delayed_work(addr_wq, &work, delay); + mod_delayed_work(addr_wq, delayed_work, delay); } static void queue_req(struct addr_req *req) @@ -318,8 +319,7 @@ static void queue_req(struct addr_req *req) list_add(&req->list, &temp_req->list); - if (req_list.next == &req->list) - set_timeout(req->timeout); + set_timeout(&req->work, req->timeout); mutex_unlock(&lock); } @@ -574,6 +574,37 @@ static int addr_resolve(struct sockaddr *src_in, return ret; } +static void process_one_req(struct work_struct *_work) +{ + struct addr_req *req; + struct sockaddr *src_in, *dst_in; + + mutex_lock(&lock); + req = container_of(_work, struct addr_req, work.work); + + if (req->status == -ENODATA) { + src_in = (struct sockaddr *)&req->src_addr; + dst_in = (struct sockaddr *)&req->dst_addr; + req->status = addr_resolve(src_in, dst_in, req->addr, + true, req->seq); + if (req->status && time_after_eq(jiffies, req->timeout)) { + req->status = -ETIMEDOUT; + } else if (req->status == -ENODATA) { + /* requeue the work for retrying again */ + set_timeout(&req->work, req->timeout); + mutex_unlock(&lock); + return; + } + } + list_del(&req->list); + mutex_unlock(&lock); + + req->callback(req->status, (struct sockaddr *)&req->src_addr, + req->addr, req->context); + put_client(req->client); + kfree(req); +} + static void process_req(struct work_struct *work) { struct addr_req *req, *temp_req; @@ -591,20 +622,23 @@ static void process_req(struct work_struct *work) true, req->seq); if (req->status && time_after_eq(jiffies, req->timeout)) req->status = -ETIMEDOUT; - else if (req->status == -ENODATA) + else if (req->status == -ENODATA) { + set_timeout(&req->work, req->timeout); continue; + } } list_move_tail(&req->list, &done_list); } - if (!list_empty(&req_list)) { - req = list_entry(req_list.next, struct addr_req, list); - set_timeout(req->timeout); - } mutex_unlock(&lock); list_for_each_entry_safe(req, temp_req, &done_list, list) { list_del(&req->list); + /* It is safe to cancel other work items from this work item + * because at a time there can be only one work item running + * with this single threaded work queue. + */ + cancel_delayed_work(&req->work); req->callback(req->status, (struct sockaddr *) &req->src_addr, req->addr, req->context); put_client(req->client); @@ -647,6 +681,7 @@ int rdma_resolve_ip(struct rdma_addr_client *client, req->context = context; req->client = client; atomic_inc(&client->refcount); + INIT_DELAYED_WORK(&req->work, process_one_req); req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq); req->status = addr_resolve(src_in, dst_in, addr, true, req->seq); @@ -701,7 +736,7 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) req->status = -ECANCELED; req->timeout = jiffies; list_move(&req->list, &req_list); - set_timeout(req->timeout); + set_timeout(&req->work, req->timeout); break; } } @@ -807,9 +842,8 @@ static int netevent_callback(struct notifier_block *self, unsigned long event, if (event == NETEVENT_NEIGH_UPDATE) { struct neighbour *neigh = ctx; - if (neigh->nud_state & NUD_VALID) { - set_timeout(jiffies); - } + if (neigh->nud_state & NUD_VALID) + set_timeout(&work, jiffies); } return 0; } @@ -820,7 +854,7 @@ static struct notifier_block nb = { int addr_init(void) { - addr_wq = alloc_workqueue("ib_addr", WQ_MEM_RECLAIM, 0); + addr_wq = alloc_ordered_workqueue("ib_addr", WQ_MEM_RECLAIM); if (!addr_wq) return -ENOMEM; From f7a6cb7b38c6845b26aaa8bbdf519ff6e3090831 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 1 Aug 2017 09:41:35 +0300 Subject: [PATCH 13/16] RDMA/uverbs: Prevent leak of reserved field initialize to zero the response structure to prevent the leakage of "resp.reserved" field. drivers/infiniband/core/uverbs_cmd.c:1178 ib_uverbs_resize_cq() warn: check that 'resp.reserved' doesn't leak information Fixes: 33b9b3ee9709 ("IB: Add userspace support for resizing CQs") Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 2c98533a0203..c551d2b275fd 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1153,7 +1153,7 @@ ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, int out_len) { struct ib_uverbs_resize_cq cmd; - struct ib_uverbs_resize_cq_resp resp; + struct ib_uverbs_resize_cq_resp resp = {}; struct ib_udata udata; struct ib_cq *cq; int ret = -EINVAL; From efdd6f53b10aead0f5cf19a93dd3eb268ac0d991 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 1 Aug 2017 09:41:36 +0300 Subject: [PATCH 14/16] IB/uverbs: Fix device cleanup Uverbs device should be cleaned up only when there is no potential usage of. As part of ib_uverbs_remove_one which might be triggered upon reset flow the device reference count is decreased as expected and leave the final cleanup to the FDs that were opened. Current code increases reference count upon opening a new command FD and decreases it upon closing the file. The event FD is opened internally and rely on the command FD by taking on it a reference count. In case that the command FD was closed and just later the event FD we may ensure that the device resources as of srcu are still alive as they are still in use. Fixing the above by moving the reference count decreasing to the place where the command FD is really freed instead of doing that when it was just closed. fixes: 036b10635739 ("IB/uverbs: Enable device removal when there are active user space applications") Signed-off-by: Yishai Hadas Reviewed-by: Matan Barak Reviewed-by: Jason Gunthorpe Tested-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 3d2609608f58..c023e2c81b8f 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -250,6 +250,7 @@ void ib_uverbs_release_file(struct kref *ref) if (atomic_dec_and_test(&file->device->refcount)) ib_uverbs_comp_dev(file->device); + kobject_put(&file->device->kobj); kfree(file); } @@ -917,7 +918,6 @@ err: static int ib_uverbs_close(struct inode *inode, struct file *filp) { struct ib_uverbs_file *file = filp->private_data; - struct ib_uverbs_device *dev = file->device; mutex_lock(&file->cleanup_mutex); if (file->ucontext) { @@ -939,7 +939,6 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) ib_uverbs_release_async_event_file); kref_put(&file->ref, ib_uverbs_release_file); - kobject_put(&dev->kobj); return 0; } From 931b3c1a832621b4bdcbaf783096fc267eb36fbe Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 1 Aug 2017 09:41:37 +0300 Subject: [PATCH 15/16] RDMA/mlx5: Fix existence check for extended address vector The extended address vector is the highest bit in be32 variable, but it was compared with the lowest. This patch fixes the endianness of that check and removes already declared define. Fixes: 17d2f88f92ce ("IB/mlx5: Add ODP atomics support") Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/odp.c | 2 +- include/linux/mlx5/qp.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index ae0746754008..3d701c7a4c91 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -939,7 +939,7 @@ static int mlx5_ib_mr_initiator_pfault_handler( if (qp->ibqp.qp_type != IB_QPT_RC) { av = *wqe; - if (av->dqp_dct & be32_to_cpu(MLX5_WQE_AV_EXT)) + if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV)) *wqe += sizeof(struct mlx5_av); else *wqe += sizeof(struct mlx5_base_av); diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 6f41270d80c0..f378dc0e7eaf 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -212,7 +212,6 @@ struct mlx5_wqe_ctrl_seg { #define MLX5_WQE_CTRL_OPCODE_MASK 0xff #define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00 #define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8 -#define MLX5_WQE_AV_EXT 0x80000000 enum { MLX5_ETH_WQE_L3_INNER_CSUM = 1 << 4, From 5db465f235e74293e285e1fa924a55e52ba52a98 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 4 Aug 2017 11:12:08 +0300 Subject: [PATCH 16/16] IB/hns: checking for IS_ERR() instead of NULL The hns_roce_v1_create_lp_qp() returns NULL on error, not error pointers. Fixes: bfcc681bd09d ("IB/hns: Fix the bug when free mr") Signed-off-by: Dan Carpenter Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 23fad6d96944..2540b65e242c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -733,7 +733,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) continue; free_mr->mr_free_qp[i] = hns_roce_v1_create_lp_qp(hr_dev, pd); - if (IS_ERR(free_mr->mr_free_qp[i])) { + if (!free_mr->mr_free_qp[i]) { dev_err(dev, "Create loop qp failed!\n"); goto create_lp_qp_failed; }