diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index ed42c1009685..3fd36b421d51 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -3248,13 +3248,14 @@ static int bnx2x_poll(struct napi_struct *napi, int budget)
 			rmb();
 
 			if (!(bnx2x_has_rx_work(fp) || bnx2x_has_tx_work(fp))) {
-				napi_complete(napi);
-				/* Re-enable interrupts */
-				DP(NETIF_MSG_RX_STATUS,
-				   "Update index to %d\n", fp->fp_hc_idx);
-				bnx2x_ack_sb(bp, fp->igu_sb_id, USTORM_ID,
-					     le16_to_cpu(fp->fp_hc_idx),
-					     IGU_INT_ENABLE, 1);
+				if (napi_complete_done(napi, rx_work_done)) {
+					/* Re-enable interrupts */
+					DP(NETIF_MSG_RX_STATUS,
+					   "Update index to %d\n", fp->fp_hc_idx);
+					bnx2x_ack_sb(bp, fp->igu_sb_id, USTORM_ID,
+						     le16_to_cpu(fp->fp_hc_idx),
+						     IGU_INT_ENABLE, 1);
+				}
 			} else {
 				rx_work_done = budget;
 			}
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 2cc91002064f..22f08f9ef464 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -1137,8 +1137,8 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget)
 		done = 0;
 	}
 	/* Done for now */
-	napi_complete_done(napi, done);
-	mlx4_en_arm_cq(priv, cq);
+	if (napi_complete_done(napi, done))
+		mlx4_en_arm_cq(priv, cq);
 	return done;
 }
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 86bacf6a64f0..bcddf951ccee 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -334,6 +334,16 @@ enum {
 	NAPI_STATE_NPSVC,	/* Netpoll - don't dequeue from poll_list */
 	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */
 	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
+	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+};
+
+enum {
+	NAPIF_STATE_SCHED	 = (1UL << NAPI_STATE_SCHED),
+	NAPIF_STATE_DISABLE	 = (1UL << NAPI_STATE_DISABLE),
+	NAPIF_STATE_NPSVC	 = (1UL << NAPI_STATE_NPSVC),
+	NAPIF_STATE_HASHED	 = (1UL << NAPI_STATE_HASHED),
+	NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL),
+	NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL),
 };
 
 enum gro_result {
@@ -453,16 +463,17 @@ static inline bool napi_reschedule(struct napi_struct *napi)
 	return false;
 }
 
-void __napi_complete(struct napi_struct *n);
-void napi_complete_done(struct napi_struct *n, int work_done);
+bool __napi_complete(struct napi_struct *n);
+bool napi_complete_done(struct napi_struct *n, int work_done);
 /**
  *	napi_complete - NAPI processing complete
  *	@n: NAPI context
  *
  * Mark NAPI processing as complete.
  * Consider using napi_complete_done() instead.
+ * Return false if device should avoid rearming interrupts.
  */
-static inline void napi_complete(struct napi_struct *n)
+static inline bool napi_complete(struct napi_struct *n)
 {
 	return napi_complete_done(n, 0);
 }
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 2fbeb1313c0f..965e52b9b5a3 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -58,10 +58,9 @@ static inline unsigned long busy_loop_end_time(void)
 	return busy_loop_us_clock() + ACCESS_ONCE(sysctl_net_busy_poll);
 }
 
-static inline bool sk_can_busy_loop(struct sock *sk)
+static inline bool sk_can_busy_loop(const struct sock *sk)
 {
-	return sk->sk_ll_usec && sk->sk_napi_id &&
-	       !need_resched() && !signal_pending(current);
+	return sk->sk_ll_usec && sk->sk_napi_id && !signal_pending(current);
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 6deba68ad9e4..edba9efeb2e9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4898,26 +4898,36 @@ void __napi_schedule_irqoff(struct napi_struct *n)
 }
 EXPORT_SYMBOL(__napi_schedule_irqoff);
 
-void __napi_complete(struct napi_struct *n)
+bool __napi_complete(struct napi_struct *n)
 {
 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
 
+	/* Some drivers call us directly, instead of calling
+	 * napi_complete_done().
+	 */
+	if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
+		return false;
+
 	list_del_init(&n->poll_list);
 	smp_mb__before_atomic();
 	clear_bit(NAPI_STATE_SCHED, &n->state);
+	return true;
 }
 EXPORT_SYMBOL(__napi_complete);
 
-void napi_complete_done(struct napi_struct *n, int work_done)
+bool napi_complete_done(struct napi_struct *n, int work_done)
 {
 	unsigned long flags;
 
 	/*
-	 * don't let napi dequeue from the cpu poll list
-	 * just in case its running on a different cpu
+	 * 1) Don't let napi dequeue from the cpu poll list
+	 *    just in case its running on a different cpu.
+	 * 2) If we are busy polling, do nothing here, we have
+	 *    the guarantee we will be called later.
 	 */
-	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
-		return;
+	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
+				 NAPIF_STATE_IN_BUSY_POLL)))
+		return false;
 
 	if (n->gro_list) {
 		unsigned long timeout = 0;
@@ -4939,6 +4949,7 @@ void napi_complete_done(struct napi_struct *n, int work_done)
 		__napi_complete(n);
 		local_irq_restore(flags);
 	}
+	return true;
 }
 EXPORT_SYMBOL(napi_complete_done);
 
@@ -4956,13 +4967,41 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
 }
 
 #if defined(CONFIG_NET_RX_BUSY_POLL)
+
 #define BUSY_POLL_BUDGET 8
+
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
+{
+	int rc;
+
+	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
+
+	local_bh_disable();
+
+	/* All we really want here is to re-enable device interrupts.
+	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
+	 */
+	rc = napi->poll(napi, BUSY_POLL_BUDGET);
+	netpoll_poll_unlock(have_poll_lock);
+	if (rc == BUSY_POLL_BUDGET)
+		__napi_schedule(napi);
+	local_bh_enable();
+	if (local_softirq_pending())
+		do_softirq();
+}
+
 bool sk_busy_loop(struct sock *sk, int nonblock)
 {
 	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
+	int (*napi_poll)(struct napi_struct *napi, int budget);
 	int (*busy_poll)(struct napi_struct *dev);
+	void *have_poll_lock = NULL;
 	struct napi_struct *napi;
-	int rc = false;
+	int rc;
+
+restart:
+	rc = false;
+	napi_poll = NULL;
 
 	rcu_read_lock();
 
@@ -4973,24 +5012,33 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
 	/* Note: ndo_busy_poll method is optional in linux-4.5 */
 	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
 
-	do {
+	preempt_disable();
+	for (;;) {
 		rc = 0;
 		local_bh_disable();
 		if (busy_poll) {
 			rc = busy_poll(napi);
-		} else if (napi_schedule_prep(napi)) {
-			void *have = netpoll_poll_lock(napi);
-
-			if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
-				rc = napi->poll(napi, BUSY_POLL_BUDGET);
-				trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
-				if (rc == BUSY_POLL_BUDGET) {
-					napi_complete_done(napi, rc);
-					napi_schedule(napi);
-				}
-			}
-			netpoll_poll_unlock(have);
+			goto count;
 		}
+		if (!napi_poll) {
+			unsigned long val = READ_ONCE(napi->state);
+
+			/* If multiple threads are competing for this napi,
+			 * we avoid dirtying napi->state as much as we can.
+			 */
+			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
+				   NAPIF_STATE_IN_BUSY_POLL))
+				goto count;
+			if (cmpxchg(&napi->state, val,
+				    val | NAPIF_STATE_IN_BUSY_POLL |
+					  NAPIF_STATE_SCHED) != val)
+				goto count;
+			have_poll_lock = netpoll_poll_lock(napi);
+			napi_poll = napi->poll;
+		}
+		rc = napi_poll(napi, BUSY_POLL_BUDGET);
+		trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
count:
 		if (rc > 0)
 			__NET_ADD_STATS(sock_net(sk),
 					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
@@ -4999,10 +5047,26 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
 		if (rc == LL_FLUSH_FAILED)
 			break; /* permanent failure */
 
-		cpu_relax();
-	} while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
-		 !need_resched() && !busy_loop_timeout(end_time));
+		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
+		    busy_loop_timeout(end_time))
+			break;
 
+		if (unlikely(need_resched())) {
+			if (napi_poll)
+				busy_poll_stop(napi, have_poll_lock);
+			preempt_enable();
+			rcu_read_unlock();
+			cond_resched();
+			rc = !skb_queue_empty(&sk->sk_receive_queue);
+			if (rc || busy_loop_timeout(end_time))
+				return rc;
+			goto restart;
+		}
+		cpu_relax_lowlatency();
+	}
+	if (napi_poll)
+		busy_poll_stop(napi, have_poll_lock);
+	preempt_enable();
 	rc = !skb_queue_empty(&sk->sk_receive_queue);
 out:
 	rcu_read_unlock();
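
Note (illustration, not part of the patch): the hunks above make napi_complete_done() return a bool, and the bnx2x/mlx4 hunks show the intended driver-side pattern: re-arm the device interrupt only when the call returns true, because a false return means sk_busy_loop() currently owns this NAPI and will keep polling it. Below is a minimal sketch of that pattern in a generic poll routine; struct foo_priv, foo_clean_rx() and foo_enable_irq() are hypothetical placeholders, not functions from this patch or from any real driver.

#include <linux/kernel.h>
#include <linux/netdevice.h>

struct foo_priv {
	struct napi_struct napi;
	/* device-specific state ... */
};

/* Hypothetical driver helpers, assumed to exist for this sketch only */
int foo_clean_rx(struct foo_priv *priv, int budget);
void foo_enable_irq(struct foo_priv *priv);

static int foo_napi_poll(struct napi_struct *napi, int budget)
{
	struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
	int work_done;

	work_done = foo_clean_rx(priv, budget);

	/* Budget exhausted: stay scheduled, net_rx_action() or the
	 * busy-poll loop will invoke this poll routine again.
	 */
	if (work_done == budget)
		return budget;

	/* All pending work handled. Only re-enable the device interrupt
	 * when napi_complete_done() actually completed the NAPI; a false
	 * return means busy polling is in progress and the interrupt
	 * must stay masked.
	 */
	if (napi_complete_done(napi, work_done))
		foo_enable_irq(priv);

	return work_done;
}

Drivers that still call napi_complete() or __napi_complete() keep working: per the dev.c hunks, both now return false while NAPI_STATE_IN_BUSY_POLL is set, so a driver that checks their return value in the same way also avoids re-arming interrupts during busy polling.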