From 042f36e1560cedfe524607791fa44607a3121f63 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Tue, 8 Nov 2011 13:27:31 -0500 Subject: [PATCH 1/4] IB/qib: Don't use schedule_work() It was mistakenly introduced by dde05cbdf8b1 ("IB/qib: Hold links until tuning data is available"). Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_iba7322.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 5bd2162b95dc..8b46b608c02f 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -5241,7 +5241,7 @@ static int qib_7322_ib_updown(struct qib_pportdata *ppd, int ibup, u64 ibcs) off */ if (ppd->dd->flags & QIB_HAS_QSFP) { qd->t_insert = get_jiffies_64(); - schedule_work(&qd->work); + queue_work(ib_wq, &qd->work); } spin_lock_irqsave(&ppd->sdma_lock, flags); if (__qib_sdma_running(ppd)) From 8ee887d74b3d741991edaa1836d22636c28926d9 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Wed, 9 Nov 2011 17:07:22 -0500 Subject: [PATCH 2/4] IB/qib: Fix over-scheduling of QSFP work Don't over-schedule QSFP work on driver initialization. It could end up being run simultaneously on two different CPUs resulting in bad EEPROM reads. In combination with setting the physical IB link state prior to the IBC being brought out of reset, this can cause the link state machine to start training early with wrong settings. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_iba7322.c | 16 ++++++++-------- drivers/infiniband/hw/qib/qib_qsfp.c | 12 ------------ 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 8b46b608c02f..1d5895941e19 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -2307,19 +2307,11 @@ static int qib_7322_bringup_serdes(struct qib_pportdata *ppd) SYM_LSB(IBCCtrlA_0, MaxPktLen); ppd->cpspec->ibcctrl_a = ibc; /* without linkcmd or linkinitcmd! */ - /* initially come up waiting for TS1, without sending anything. */ - val = ppd->cpspec->ibcctrl_a | (QLOGIC_IB_IBCC_LINKINITCMD_DISABLE << - QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); - - ppd->cpspec->ibcctrl_a = val; /* * Reset the PCS interface to the serdes (and also ibc, which is still * in reset from above). Writes new value of ibcctrl_a as last step. */ qib_7322_mini_pcs_reset(ppd); - qib_write_kreg(dd, kr_scratch, 0ULL); - /* clear the linkinit cmds */ - ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, LinkInitCmd); if (!ppd->cpspec->ibcctrl_b) { unsigned lse = ppd->link_speed_enabled; @@ -2385,6 +2377,14 @@ static int qib_7322_bringup_serdes(struct qib_pportdata *ppd) ppd->cpspec->ibcctrl_a |= SYM_MASK(IBCCtrlA_0, IBLinkEn); set_vls(ppd); + /* initially come up DISABLED, without sending anything. */ + val = ppd->cpspec->ibcctrl_a | (QLOGIC_IB_IBCC_LINKINITCMD_DISABLE << + QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + qib_write_kreg_port(ppd, krp_ibcctrl_a, val); + qib_write_kreg(dd, kr_scratch, 0ULL); + /* clear the linkinit cmds */ + ppd->cpspec->ibcctrl_a = val & ~SYM_MASK(IBCCtrlA_0, LinkInitCmd); + /* be paranoid against later code motion, etc. */ spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); ppd->p_rcvctrl |= SYM_MASK(RcvCtrl_0, RcvIBPortEnable); diff --git a/drivers/infiniband/hw/qib/qib_qsfp.c b/drivers/infiniband/hw/qib/qib_qsfp.c index e06c4ed383f1..fa71b1e666c5 100644 --- a/drivers/infiniband/hw/qib/qib_qsfp.c +++ b/drivers/infiniband/hw/qib/qib_qsfp.c @@ -480,18 +480,6 @@ void qib_qsfp_init(struct qib_qsfp_data *qd, udelay(20); /* Generous RST dwell */ dd->f_gpio_mod(dd, mask, mask, mask); - /* Spec says module can take up to two seconds! */ - mask = QSFP_GPIO_MOD_PRS_N; - if (qd->ppd->hw_pidx) - mask <<= QSFP_GPIO_PORT2_SHIFT; - - /* Do not try to wait here. Better to let event handle it */ - if (!qib_qsfp_mod_present(qd->ppd)) - goto bail; - /* We see a module, but it may be unwise to look yet. Just schedule */ - qd->t_insert = get_jiffies_64(); - queue_work(ib_wq, &qd->work); -bail: return; } From 3874397c0bdec3c21ce071711cd105165179b8eb Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 21 Nov 2011 08:43:54 -0500 Subject: [PATCH 3/4] IB/ipoib: Prevent hung task or softlockup processing multicast response This following can occur with ipoib when processing a multicast reponse: BUG: soft lockup - CPU#0 stuck for 67s! [ib_mad1:982] Modules linked in: ... CPU 0: Modules linked in: ... Pid: 982, comm: ib_mad1 Not tainted 2.6.32-131.0.15.el6.x86_64 #1 ProLiant DL160 G5 RIP: 0010:[] [] _spin_unlock_irqrestore+0x17/0x20 RSP: 0018:ffff8802119ed860 EFLAGS: 00000246 0000000000000004 RBX: ffff8802119ed860 RCX: 000000000000a299 RDX: ffff88021086c700 RSI: 0000000000000246 RDI: 0000000000000246 RBP: ffffffff8100bc8e R08: ffff880210ac229c R09: 0000000000000000 R10: ffff88021278aab8 R11: 0000000000000000 R12: ffff8802119ed860 R13: ffffffff8100be6e R14: 0000000000000001 R15: 0000000000000003 FS: 0000000000000000(0000) GS:ffff880028200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 00000000006d4840 CR3: 0000000209aa5000 CR4: 00000000000406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Call Trace: [] ? ipoib_mcast_send+0x157/0x480 [ib_ipoib] [] ? apic_timer_interrupt+0xe/0x20 [] ? apic_timer_interrupt+0xe/0x20 [] ? ipoib_path_lookup+0x124/0x2d0 [ib_ipoib] [] ? ipoib_start_xmit+0x17c/0x430 [ib_ipoib] [] ? dev_hard_start_xmit+0x2c8/0x3f0 [] ? sch_direct_xmit+0x15a/0x1c0 [] ? dev_queue_xmit+0x388/0x4d0 [] ? ipoib_mcast_join_finish+0x2c7/0x510 [ib_ipoib] [] ? ipoib_mcast_sendonly_join_complete+0x1b8/0x1f0 [ib_ipoib] [] ? mcast_work_handler+0x1a6/0x710 [ib_sa] [] ? ib_send_mad+0xfe/0x3c0 [ib_mad] [] ? ib_get_cached_lmc+0xa3/0xb0 [ib_core] [] ? join_handler+0xeb/0x200 [ib_sa] [] ? ib_sa_mcmember_rec_callback+0x5c/0xa0 [ib_sa] [] ? recv_handler+0x3c/0x70 [ib_sa] [] ? ib_mad_completion_handler+0x844/0x9d0 [ib_mad] [] ? ib_mad_completion_handler+0x0/0x9d0 [ib_mad] [] ? worker_thread+0x170/0x2a0 [] ? autoremove_wake_function+0x0/0x40 [] ? worker_thread+0x0/0x2a0 [] ? kthread+0x96/0xa0 [] ? child_rip+0xa/0x20 Coinciding with stack trace is the following message: ib0: ib_address_create failed The code below in ipoib_mcast_join_finish() will note the above failure in the address handle but otherwise continue: ah = ipoib_create_ah(dev, priv->pd, &av); if (!ah) { ipoib_warn(priv, "ib_address_create failed\n"); } else { The while loop at the bottom of ipoib_mcast_join_finish() will attempt to send queued multicast packets in mcast->pkt_queue and eventually end up in ipoib_mcast_send(): if (!mcast->ah) { if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) skb_queue_tail(&mcast->pkt_queue, skb); else { ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); } My read is that the code will requeue the packet and return to the ipoib_mcast_join_finish() while loop and the stage is set for the "hung" task diagnostic as the while loop never sees a non-NULL ah, and will do nothing to resolve. There are GFP_ATOMIC allocates in the provider routines, so this is possible and should be dealt with. The test that induced the failure is associated with a host SM on the same server during a shutdown. This patch causes ipoib_mcast_join_finish() to exit with an error which will flush the queued mcast packets. Nothing is done to unwind the QP attached state so that subsequent sends from above will retry the join. Reviewed-by: Ram Vepa Reviewed-by: Gary Leshner Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 13 ++++++++----- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 +- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 7 +++++-- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 0ef9af94997d..4115be54ba3b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -57,21 +57,24 @@ struct ipoib_ah *ipoib_create_ah(struct net_device *dev, struct ib_pd *pd, struct ib_ah_attr *attr) { struct ipoib_ah *ah; + struct ib_ah *vah; ah = kmalloc(sizeof *ah, GFP_KERNEL); if (!ah) - return NULL; + return ERR_PTR(-ENOMEM); ah->dev = dev; ah->last_send = 0; kref_init(&ah->ref); - ah->ah = ib_create_ah(pd, attr); - if (IS_ERR(ah->ah)) { + vah = ib_create_ah(pd, attr); + if (IS_ERR(vah)) { kfree(ah); - ah = NULL; - } else + ah = (struct ipoib_ah *)vah; + } else { + ah->ah = vah; ipoib_dbg(netdev_priv(dev), "Created ah %p\n", ah->ah); + } return ah; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 7567b6000230..37c46c66b0f2 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -432,7 +432,7 @@ static void path_rec_completion(int status, spin_lock_irqsave(&priv->lock, flags); - if (ah) { + if (!IS_ERR_OR_NULL(ah)) { path->pathrec = *pathrec; old_ah = path->ah; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 1b7a97686356..30ca8f069a17 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -240,8 +240,11 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, av.grh.dgid = mcast->mcmember.mgid; ah = ipoib_create_ah(dev, priv->pd, &av); - if (!ah) { - ipoib_warn(priv, "ib_address_create failed\n"); + if (IS_ERR(ah)) { + ipoib_warn(priv, "ib_address_create failed %ld\n", + -PTR_ERR(ah)); + /* use original error */ + return PTR_ERR(ah); } else { spin_lock_irq(&priv->lock); mcast->ah = ah; From 580da35a31f91a594f3090b7a2c39b85cb051a12 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 29 Nov 2011 22:31:23 +0100 Subject: [PATCH 4/4] IB: Fix RCU lockdep splats Commit f2c31e32b37 ("net: fix NULL dereferences in check_peer_redir()") forgot to take care of infiniband uses of dst neighbours. Many thanks to Marc Aurele who provided a nice bug report and feedback. Reported-by: Marc Aurele La France Signed-off-by: Eric Dumazet Cc: David Miller Cc: Signed-off-by: Roland Dreier --- drivers/infiniband/core/addr.c | 9 ++++++--- drivers/infiniband/hw/cxgb3/iwch_cm.c | 4 ++++ drivers/infiniband/hw/cxgb4/cm.c | 6 ++++++ drivers/infiniband/hw/nes/nes_cm.c | 6 ++++-- drivers/infiniband/ulp/ipoib/ipoib_main.c | 18 +++++++++++------- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 6 ++++-- 6 files changed, 35 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 691276bafd78..e9cf51b1343b 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -216,7 +216,9 @@ static int addr4_resolve(struct sockaddr_in *src_in, neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, rt->dst.dev); if (!neigh || !(neigh->nud_state & NUD_VALID)) { + rcu_read_lock(); neigh_event_send(dst_get_neighbour(&rt->dst), NULL); + rcu_read_unlock(); ret = -ENODATA; if (neigh) goto release; @@ -274,15 +276,16 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, goto put; } + rcu_read_lock(); neigh = dst_get_neighbour(dst); if (!neigh || !(neigh->nud_state & NUD_VALID)) { if (neigh) neigh_event_send(neigh, NULL); ret = -ENODATA; - goto put; + } else { + ret = rdma_copy_addr(addr, dst->dev, neigh->ha); } - - ret = rdma_copy_addr(addr, dst->dev, neigh->ha); + rcu_read_unlock(); put: dst_release(dst); return ret; diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index de6d0774e609..c88b12beef25 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -1375,8 +1375,10 @@ static int pass_accept_req(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) goto reject; } dst = &rt->dst; + rcu_read_lock(); neigh = dst_get_neighbour(dst); l2t = t3_l2t_get(tdev, neigh, neigh->dev); + rcu_read_unlock(); if (!l2t) { printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n", __func__); @@ -1946,10 +1948,12 @@ int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) } ep->dst = &rt->dst; + rcu_read_lock(); neigh = dst_get_neighbour(ep->dst); /* get a l2t entry */ ep->l2t = t3_l2t_get(ep->com.tdev, neigh, neigh->dev); + rcu_read_unlock(); if (!ep->l2t) { printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__); err = -ENOMEM; diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index b36cdac9c558..75b57bee6622 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1594,6 +1594,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) goto reject; } dst = &rt->dst; + rcu_read_lock(); neigh = dst_get_neighbour(dst); if (neigh->dev->flags & IFF_LOOPBACK) { pdev = ip_dev_find(&init_net, peer_ip); @@ -1620,6 +1621,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) rss_qid = dev->rdev.lldi.rxq_ids[ cxgb4_port_idx(neigh->dev) * step]; } + rcu_read_unlock(); if (!l2t) { printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n", __func__); @@ -1820,6 +1822,7 @@ static int c4iw_reconnect(struct c4iw_ep *ep) } ep->dst = &rt->dst; + rcu_read_lock(); neigh = dst_get_neighbour(ep->dst); /* get a l2t entry */ @@ -1856,6 +1859,7 @@ static int c4iw_reconnect(struct c4iw_ep *ep) ep->rss_qid = ep->com.dev->rdev.lldi.rxq_ids[ cxgb4_port_idx(neigh->dev) * step]; } + rcu_read_unlock(); if (!ep->l2t) { printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__); err = -ENOMEM; @@ -2301,6 +2305,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) } ep->dst = &rt->dst; + rcu_read_lock(); neigh = dst_get_neighbour(ep->dst); /* get a l2t entry */ @@ -2339,6 +2344,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ep->retry_with_mpa_v1 = 0; ep->tried_with_mpa_v1 = 0; } + rcu_read_unlock(); if (!ep->l2t) { printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__); err = -ENOMEM; diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index dfce9ea98a39..0a52d72371ee 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1377,9 +1377,11 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpi neigh_release(neigh); } - if ((neigh == NULL) || (!(neigh->nud_state & NUD_VALID))) + if ((neigh == NULL) || (!(neigh->nud_state & NUD_VALID))) { + rcu_read_lock(); neigh_event_send(dst_get_neighbour(&rt->dst), NULL); - + rcu_read_unlock(); + } ip_rt_put(rt); return rc; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 7567b6000230..ef38848d1b0e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -555,6 +555,7 @@ static int path_rec_start(struct net_device *dev, return 0; } +/* called with rcu_read_lock */ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -636,6 +637,7 @@ err_drop: spin_unlock_irqrestore(&priv->lock, flags); } +/* called with rcu_read_lock */ static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(skb->dev); @@ -720,13 +722,14 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) struct neighbour *n = NULL; unsigned long flags; + rcu_read_lock(); if (likely(skb_dst(skb))) n = dst_get_neighbour(skb_dst(skb)); if (likely(n)) { if (unlikely(!*to_ipoib_neigh(n))) { ipoib_path_lookup(skb, dev); - return NETDEV_TX_OK; + goto unlock; } neigh = *to_ipoib_neigh(n); @@ -749,17 +752,17 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) ipoib_neigh_free(dev, neigh); spin_unlock_irqrestore(&priv->lock, flags); ipoib_path_lookup(skb, dev); - return NETDEV_TX_OK; + goto unlock; } if (ipoib_cm_get(neigh)) { if (ipoib_cm_up(neigh)) { ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); - return NETDEV_TX_OK; + goto unlock; } } else if (neigh->ah) { ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(n->ha)); - return NETDEV_TX_OK; + goto unlock; } if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { @@ -793,13 +796,14 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) phdr->hwaddr + 4); dev_kfree_skb_any(skb); ++dev->stats.tx_dropped; - return NETDEV_TX_OK; + goto unlock; } unicast_arp_send(skb, dev, phdr); } } - +unlock: + rcu_read_unlock(); return NETDEV_TX_OK; } @@ -837,7 +841,7 @@ static int ipoib_hard_header(struct sk_buff *skb, dst = skb_dst(skb); n = NULL; if (dst) - n = dst_get_neighbour(dst); + n = dst_get_neighbour_raw(dst); if ((!dst || !n) && daddr) { struct ipoib_pseudoheader *phdr = (struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 1b7a97686356..cad1894594a8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -266,7 +266,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, skb->dev = dev; if (dst) - n = dst_get_neighbour(dst); + n = dst_get_neighbour_raw(dst); if (!dst || !n) { /* put pseudoheader back on for next time */ skb_push(skb, sizeof (struct ipoib_pseudoheader)); @@ -722,6 +722,8 @@ out: if (mcast && mcast->ah) { struct dst_entry *dst = skb_dst(skb); struct neighbour *n = NULL; + + rcu_read_lock(); if (dst) n = dst_get_neighbour(dst); if (n && !*to_ipoib_neigh(n)) { @@ -734,7 +736,7 @@ out: list_add_tail(&neigh->list, &mcast->neigh_list); } } - + rcu_read_unlock(); spin_unlock_irqrestore(&priv->lock, flags); ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN); return;