From 1853c949646005b5959c483becde86608f548f24 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 10 Sep 2015 20:05:46 +0200 Subject: [PATCH 001/116] netlink, mmap: transform mmap skb into full skb on taps Ken-ichirou reported that running netlink in mmap mode for receive in combination with nlmon will throw a NULL pointer dereference in __kfree_skb() on nlmon_xmit(), in my case I can also trigger an "unable to handle kernel paging request". The problem is the skb_clone() in __netlink_deliver_tap_skb() for skbs that are mmaped. I.e. the cloned skb doesn't have a destructor, whereas the mmap netlink skb has it pointed to netlink_skb_destructor(), set in the handler netlink_ring_setup_skb(). There, skb->head is being set to NULL, so that in such cases, __kfree_skb() doesn't perform a skb_release_data() via skb_release_all(), where skb->head is possibly being freed through kfree(head) into slab allocator, although netlink mmap skb->head points to the mmap buffer. Similarly, the same has to be done also for large netlink skbs where the data area is vmalloced. Therefore, as discussed, make a copy for these rather rare cases for now. This fixes the issue on my and Ken-ichirou's test-cases. Reference: http://thread.gmane.org/gmane.linux.network/371129 Fixes: bcbde0d449ed ("net: netlink: virtual tap device management") Reported-by: Ken-ichirou MATSUZAWA Signed-off-by: Daniel Borkmann Tested-by: Ken-ichirou MATSUZAWA Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 30 +++++++++++++++++++++++------- net/netlink/af_netlink.h | 9 +++++++++ 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 7f86d3b55060..4cad99d6c68b 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -125,6 +125,24 @@ static inline u32 netlink_group_mask(u32 group) return group ? 1 << (group - 1) : 0; } +static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb, + gfp_t gfp_mask) +{ + unsigned int len = skb_end_offset(skb); + struct sk_buff *new; + + new = alloc_skb(len, gfp_mask); + if (new == NULL) + return NULL; + + NETLINK_CB(new).portid = NETLINK_CB(skb).portid; + NETLINK_CB(new).dst_group = NETLINK_CB(skb).dst_group; + NETLINK_CB(new).creds = NETLINK_CB(skb).creds; + + memcpy(skb_put(new, len), skb->data, len); + return new; +} + int netlink_add_tap(struct netlink_tap *nt) { if (unlikely(nt->dev->type != ARPHRD_NETLINK)) @@ -206,7 +224,11 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, int ret = -ENOMEM; dev_hold(dev); - nskb = skb_clone(skb, GFP_ATOMIC); + + if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head)) + nskb = netlink_to_full_skb(skb, GFP_ATOMIC); + else + nskb = skb_clone(skb, GFP_ATOMIC); if (nskb) { nskb->dev = dev; nskb->protocol = htons((u16) sk->sk_protocol); @@ -279,11 +301,6 @@ static void netlink_rcv_wake(struct sock *sk) } #ifdef CONFIG_NETLINK_MMAP -static bool netlink_skb_is_mmaped(const struct sk_buff *skb) -{ - return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; -} - static bool netlink_rx_is_mmaped(struct sock *sk) { return nlk_sk(sk)->rx_ring.pg_vec != NULL; @@ -846,7 +863,6 @@ static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb) } #else /* CONFIG_NETLINK_MMAP */ -#define netlink_skb_is_mmaped(skb) false #define netlink_rx_is_mmaped(sk) false #define netlink_tx_is_mmaped(sk) false #define netlink_mmap sock_no_mmap diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 89008405d6b4..df9a06090db6 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -59,6 +59,15 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk) return container_of(sk, struct netlink_sock, sk); } +static inline bool netlink_skb_is_mmaped(const struct sk_buff *skb) +{ +#ifdef CONFIG_NETLINK_MMAP + return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; +#else + return false; +#endif /* CONFIG_NETLINK_MMAP */ +} + struct netlink_table { struct rhashtable hash; struct hlist_head mc_list; From 19539ce783dd27768d4f7f3b753152bf983db65b Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Thu, 10 Sep 2015 18:25:07 -0600 Subject: [PATCH 002/116] ebpf: emit correct src_reg for conditional jumps Instead of always emitting BPF_REG_X, let's emit BPF_REG_X only when the source actually is BPF_X. This causes programs generated by the classic converter to not be importable via bpf(), as the eBPF verifier checks that the src_reg is correct or 0. While not a problem yet, this will be a problem when BPF_PROG_DUMP lands, and we can potentially dump and re-import programs generated by the converter. Signed-off-by: Tycho Andersen CC: Alexei Starovoitov CC: Daniel Borkmann Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 13079f03902e..05a04ea87172 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -478,9 +478,9 @@ do_pass: bpf_src = BPF_X; } else { insn->dst_reg = BPF_REG_A; - insn->src_reg = BPF_REG_X; insn->imm = fp->k; bpf_src = BPF_SRC(fp->code); + insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; } /* Common case where 'jump_false' is next insn. */ From 8e2d61e0aed2b7c4ecb35844fe07e0b2b762dee4 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 10 Sep 2015 17:31:15 -0300 Subject: [PATCH 003/116] sctp: fix race on protocol/netns initialization Consider sctp module is unloaded and is being requested because an user is creating a sctp socket. During initialization, sctp will add the new protocol type and then initialize pernet subsys: status = sctp_v4_protosw_init(); if (status) goto err_protosw_init; status = sctp_v6_protosw_init(); if (status) goto err_v6_protosw_init; status = register_pernet_subsys(&sctp_net_ops); The problem is that after those calls to sctp_v{4,6}_protosw_init(), it is possible for userspace to create SCTP sockets like if the module is already fully loaded. If that happens, one of the possible effects is that we will have readers for net->sctp.local_addr_list list earlier than expected and sctp_net_init() does not take precautions while dealing with that list, leading to a potential panic but not limited to that, as sctp_sock_init() will copy a bunch of blank/partially initialized values from net->sctp. The race happens like this: CPU 0 | CPU 1 socket() | __sock_create | socket() inet_create | __sock_create list_for_each_entry_rcu( | answer, &inetsw[sock->type], | list) { | inet_create /* no hits */ | if (unlikely(err)) { | ... | request_module() | /* socket creation is blocked | * the module is fully loaded | */ | sctp_init | sctp_v4_protosw_init | inet_register_protosw | list_add_rcu(&p->list, | last_perm); | | list_for_each_entry_rcu( | answer, &inetsw[sock->type], sctp_v6_protosw_init | list) { | /* hit, so assumes protocol | * is already loaded | */ | /* socket creation continues | * before netns is initialized | */ register_pernet_subsys | Simply inverting the initialization order between register_pernet_subsys() and sctp_v4_protosw_init() is not possible because register_pernet_subsys() will create a control sctp socket, so the protocol must be already visible by then. Deferring the socket creation to a work-queue is not good specially because we loose the ability to handle its errors. So, as suggested by Vlad, the fix is to split netns initialization in two moments: defaults and control socket, so that the defaults are already loaded by when we register the protocol, while control socket initialization is kept at the same moment it is today. Fixes: 4db67e808640 ("sctp: Make the address lists per network namespace") Signed-off-by: Vlad Yasevich Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/protocol.c | 64 +++++++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index b7143337e4fa..3d9ea9a48289 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1186,7 +1186,7 @@ static void sctp_v4_del_protocol(void) unregister_inetaddr_notifier(&sctp_inetaddr_notifier); } -static int __net_init sctp_net_init(struct net *net) +static int __net_init sctp_defaults_init(struct net *net) { int status; @@ -1279,12 +1279,6 @@ static int __net_init sctp_net_init(struct net *net) sctp_dbg_objcnt_init(net); - /* Initialize the control inode/socket for handling OOTB packets. */ - if ((status = sctp_ctl_sock_init(net))) { - pr_err("Failed to initialize the SCTP control sock\n"); - goto err_ctl_sock_init; - } - /* Initialize the local address list. */ INIT_LIST_HEAD(&net->sctp.local_addr_list); spin_lock_init(&net->sctp.local_addr_lock); @@ -1300,9 +1294,6 @@ static int __net_init sctp_net_init(struct net *net) return 0; -err_ctl_sock_init: - sctp_dbg_objcnt_exit(net); - sctp_proc_exit(net); err_init_proc: cleanup_sctp_mibs(net); err_init_mibs: @@ -1311,15 +1302,12 @@ err_sysctl_register: return status; } -static void __net_exit sctp_net_exit(struct net *net) +static void __net_exit sctp_defaults_exit(struct net *net) { /* Free the local address list */ sctp_free_addr_wq(net); sctp_free_local_addr_list(net); - /* Free the control endpoint. */ - inet_ctl_sock_destroy(net->sctp.ctl_sock); - sctp_dbg_objcnt_exit(net); sctp_proc_exit(net); @@ -1327,9 +1315,32 @@ static void __net_exit sctp_net_exit(struct net *net) sctp_sysctl_net_unregister(net); } -static struct pernet_operations sctp_net_ops = { - .init = sctp_net_init, - .exit = sctp_net_exit, +static struct pernet_operations sctp_defaults_ops = { + .init = sctp_defaults_init, + .exit = sctp_defaults_exit, +}; + +static int __net_init sctp_ctrlsock_init(struct net *net) +{ + int status; + + /* Initialize the control inode/socket for handling OOTB packets. */ + status = sctp_ctl_sock_init(net); + if (status) + pr_err("Failed to initialize the SCTP control sock\n"); + + return status; +} + +static void __net_init sctp_ctrlsock_exit(struct net *net) +{ + /* Free the control endpoint. */ + inet_ctl_sock_destroy(net->sctp.ctl_sock); +} + +static struct pernet_operations sctp_ctrlsock_ops = { + .init = sctp_ctrlsock_init, + .exit = sctp_ctrlsock_exit, }; /* Initialize the universe into something sensible. */ @@ -1462,8 +1473,11 @@ static __init int sctp_init(void) sctp_v4_pf_init(); sctp_v6_pf_init(); - status = sctp_v4_protosw_init(); + status = register_pernet_subsys(&sctp_defaults_ops); + if (status) + goto err_register_defaults; + status = sctp_v4_protosw_init(); if (status) goto err_protosw_init; @@ -1471,9 +1485,9 @@ static __init int sctp_init(void) if (status) goto err_v6_protosw_init; - status = register_pernet_subsys(&sctp_net_ops); + status = register_pernet_subsys(&sctp_ctrlsock_ops); if (status) - goto err_register_pernet_subsys; + goto err_register_ctrlsock; status = sctp_v4_add_protocol(); if (status) @@ -1489,12 +1503,14 @@ out: err_v6_add_protocol: sctp_v4_del_protocol(); err_add_protocol: - unregister_pernet_subsys(&sctp_net_ops); -err_register_pernet_subsys: + unregister_pernet_subsys(&sctp_ctrlsock_ops); +err_register_ctrlsock: sctp_v6_protosw_exit(); err_v6_protosw_init: sctp_v4_protosw_exit(); err_protosw_init: + unregister_pernet_subsys(&sctp_defaults_ops); +err_register_defaults: sctp_v4_pf_exit(); sctp_v6_pf_exit(); sctp_sysctl_unregister(); @@ -1527,12 +1543,14 @@ static __exit void sctp_exit(void) sctp_v6_del_protocol(); sctp_v4_del_protocol(); - unregister_pernet_subsys(&sctp_net_ops); + unregister_pernet_subsys(&sctp_ctrlsock_ops); /* Free protosw registrations */ sctp_v6_protosw_exit(); sctp_v4_protosw_exit(); + unregister_pernet_subsys(&sctp_defaults_ops); + /* Unregister with socket layer. */ sctp_v6_pf_exit(); sctp_v4_pf_exit(); From a19a19de8310fb8ca2ca0621a9db1aab082943c5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 11 Sep 2015 11:33:01 +0200 Subject: [PATCH 004/116] bnx2x: use ktime_get_seconds() for timestamp commit c48f350ff5e7 "bnx2x: Add MFW dump support" added the bnx2x_update_mfw_dump() function that reads the current time and stores it in a 32-bit field that gets passed into a buffer in a fixed format. This is potentially broken when the epoch overflows in 2038, and otherwise overflows in 2106. As we're trying to avoid uses of struct timeval for this reason, I noticed the addition of this function, and tried to rewrite it in a way that is more explicit about the overflow and that will keep working once we deprecate struct timeval. I assume that it is not possible to change the ABI any more, otherwise we should try to use a 64-bit field for the seconds right away. Signed-off-by: Arnd Bergmann Cc: Yuval Mintz Cc: Ariel Elior Acked-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index e3da2bddf143..89a174fa1300 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -3705,16 +3705,14 @@ out: void bnx2x_update_mfw_dump(struct bnx2x *bp) { - struct timeval epoc; u32 drv_ver; u32 valid_dump; if (!SHMEM2_HAS(bp, drv_info)) return; - /* Update Driver load time */ - do_gettimeofday(&epoc); - SHMEM2_WR(bp, drv_info.epoc, epoc.tv_sec); + /* Update Driver load time, possibly broken in y2038 */ + SHMEM2_WR(bp, drv_info.epoc, (u32)ktime_get_real_seconds()); drv_ver = bnx2x_update_mng_version_utility(DRV_MODULE_VERSION, true); SHMEM2_WR(bp, drv_info.drv_ver, drv_ver); From c2d4fbd2163e607915cc05798ce7fb7f31117cc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Fri, 11 Sep 2015 18:39:48 +0200 Subject: [PATCH 005/116] bridge: fix igmpv3 / mldv2 report parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the newly introduced helper functions the skb pulling is hidden in the checksumming function - and undone before returning to the caller. The IGMPv3 and MLDv2 report parsing functions in the bridge still assumed that the skb is pointing to the beginning of the IGMP/MLD message while it is now kept at the beginning of the IPv4/6 header, breaking the message parsing and creating packet loss. Fixing this by taking the offset between IP and IGMP/MLD header into account, too. Fixes: 9afd85c9e455 ("net: Export IGMP/MLD message validation code") Reported-by: Tobias Powalowski Tested-by: Tobias Powalowski Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 66efdc21f548..480b3de1a0e3 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1006,7 +1006,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); - len = sizeof(*ih); + len = skb_transport_offset(skb) + sizeof(*ih); for (i = 0; i < num; i++) { len += sizeof(*grec); @@ -1067,7 +1067,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, icmp6h = icmp6_hdr(skb); num = ntohs(icmp6h->icmp6_dataun.un_data16[1]); - len = sizeof(*icmp6h); + len = skb_transport_offset(skb) + sizeof(*icmp6h); for (i = 0; i < num; i++) { __be16 *nsrcs, _nsrcs; From 38c089d1d8d058f5dff018a811568aa8e8bc47fc Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Fri, 11 Sep 2015 15:01:16 -0700 Subject: [PATCH 006/116] openvswitch: Fix dependency on IPv6 defrag. When NF_CONNTRACK is built-in, NF_DEFRAG_IPV6 is a module, and OPENVSWITCH is built-in, the following build error would occur: net/built-in.o: In function `ovs_ct_execute': (.text+0x10f587): undefined reference to `nf_ct_frag6_gather' Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action") Reported-by: Jim Davis Signed-off-by: Joe Stringer Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 2a071f470d57..d143aa9f6654 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -5,7 +5,8 @@ config OPENVSWITCH tristate "Open vSwitch" depends on INET - depends on (!NF_CONNTRACK || NF_CONNTRACK) + depends on !NF_CONNTRACK || \ + (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6)) select LIBCRC32C select MPLS select NET_MPLS_GSO From e8684c88774c0ddfeefdbed0aa469b25b9962f3e Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Sat, 12 Sep 2015 00:34:48 +0300 Subject: [PATCH 007/116] irda: ali-ircc: Fix deadlock in ali_ircc_sir_change_speed() ali_ircc_sir_change_speed() is always called with self->lock held, so acquiring the lock inside it leads to unavoidable deadlock. Call graph: ali_ircc_sir_change_speed() is called from ali_ircc_change_speed() ali_ircc_fir_hard_xmit() under spin_lock_irqsave(&self->lock, flags); ali_ircc_sir_hard_xmit() under spin_lock_irqsave(&self->lock, flags); ali_ircc_net_ioctl() under spin_lock_irqsave(&self->lock, flags); ali_ircc_dma_xmit_complete() ali_ircc_fir_interrupt() ali_ircc_interrupt() under spin_lock(&self->lock); ali_ircc_sir_write_wakeup() ali_ircc_sir_interrupt() ali_ircc_interrupt() under spin_lock(&self->lock); The patch removes spin_lock/unlock from ali_ircc_sir_change_speed(). Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Signed-off-by: David S. Miller --- drivers/net/irda/ali-ircc.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/net/irda/ali-ircc.c b/drivers/net/irda/ali-ircc.c index 58ae11a14bb6..64bb44d5d867 100644 --- a/drivers/net/irda/ali-ircc.c +++ b/drivers/net/irda/ali-ircc.c @@ -1031,7 +1031,6 @@ static void ali_ircc_fir_change_speed(struct ali_ircc_cb *priv, __u32 baud) static void ali_ircc_sir_change_speed(struct ali_ircc_cb *priv, __u32 speed) { struct ali_ircc_cb *self = priv; - unsigned long flags; int iobase; int fcr; /* FIFO control reg */ int lcr; /* Line control reg */ @@ -1061,8 +1060,6 @@ static void ali_ircc_sir_change_speed(struct ali_ircc_cb *priv, __u32 speed) /* Update accounting for new speed */ self->io.speed = speed; - spin_lock_irqsave(&self->lock, flags); - divisor = 115200/speed; fcr = UART_FCR_ENABLE_FIFO; @@ -1089,9 +1086,6 @@ static void ali_ircc_sir_change_speed(struct ali_ircc_cb *priv, __u32 speed) /* without this, the connection will be broken after come back from FIR speed, but with this, the SIR connection is harder to established */ outb((UART_MCR_DTR | UART_MCR_RTS | UART_MCR_OUT2), iobase+UART_MCR); - - spin_unlock_irqrestore(&self->lock, flags); - } static void ali_ircc_change_dongle_speed(struct ali_ircc_cb *priv, int speed) From 205ee117d4dc4a11ac3bd9638bb9b2e839f4de9a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 9 Sep 2015 02:57:21 +0200 Subject: [PATCH 008/116] netfilter: nf_log: don't zap all loggers on unregister like nf_log_unset, nf_log_unregister must not reset the list of loggers. Otherwise, a call to nf_log_unregister() will render loggers of other nf protocols unusable: iptables -A INPUT -j LOG modprobe nf_log_arp ; rmmod nf_log_arp iptables -A INPUT -j LOG iptables: No chain/target/match by that name Fixes: 30e0c6a6be ("netfilter: nf_log: prepare net namespace support for loggers") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_log.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 675d12c69e32..a5ebd7d9c472 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -107,11 +107,15 @@ EXPORT_SYMBOL(nf_log_register); void nf_log_unregister(struct nf_logger *logger) { + const struct nf_logger *log; int i; mutex_lock(&nf_log_mutex); - for (i = 0; i < NFPROTO_NUMPROTO; i++) - RCU_INIT_POINTER(loggers[i][logger->type], NULL); + for (i = 0; i < NFPROTO_NUMPROTO; i++) { + log = nft_log_dereference(loggers[i][logger->type]); + if (log == logger) + RCU_INIT_POINTER(loggers[i][logger->type], NULL); + } mutex_unlock(&nf_log_mutex); } EXPORT_SYMBOL(nf_log_unregister); From 63cdbc06b357dcb3a7104a421ee4a4550d7fadfd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 14 Sep 2015 17:06:27 +0200 Subject: [PATCH 009/116] netfilter: bridge: fix routing of bridge frames with call-iptables=1 We can't re-use the physoutdev storage area. 1. When using NFQUEUE in PREROUTING, we attempt to bump a bogus refcnt since nf_bridge->physoutdev is garbage (ipv4/ipv6 address) 2. for same reason, we crash in physdev match in FORWARD or later if skb is routed instead of bridged. This increases nf_bridge_info to 40 bytes, but we have no other choice. Fixes: 72b1e5e4cac7 ("netfilter: bridge: reduce nf_bridge_info to 32 bytes again") Reported-by: Sander Eikelenboom Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/skbuff.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2738d355cdf9..9987af080fa0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -179,6 +179,9 @@ struct nf_bridge_info { u8 bridged_dnat:1; __u16 frag_max_size; struct net_device *physindev; + + /* always valid & non-NULL from FORWARD on, for physdev match */ + struct net_device *physoutdev; union { /* prerouting: detect dnat in orig/reply direction */ __be32 ipv4_daddr; @@ -189,9 +192,6 @@ struct nf_bridge_info { * skb is out in neigh layer. */ char neigh_header[8]; - - /* always valid & non-NULL from FORWARD on, for physdev match */ - struct net_device *physoutdev; }; }; #endif From ba378ca9c04a5fc1b2cf0f0274a9d02eb3d1bad9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 14 Sep 2015 18:04:09 +0200 Subject: [PATCH 010/116] netfilter: nft_compat: skip family comparison in case of NFPROTO_UNSPEC Fix lookup of existing match/target structures in the corresponding list by skipping the family check if NFPROTO_UNSPEC is used. This is resulting in the allocation and insertion of one match/target structure for each use of them. So this not only bloats memory consumption but also severely affects the time to reload the ruleset from the iptables-compat utility. After this patch, iptables-compat-restore and iptables-compat take almost the same time to reload large rulesets. Fixes: 0ca743a55991 ("netfilter: nf_tables: add compatibility layer for x_tables") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 66def315eb56..9c8fab00164b 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -619,6 +619,13 @@ struct nft_xt { static struct nft_expr_type nft_match_type; +static bool nft_match_cmp(const struct xt_match *match, + const char *name, u32 rev, u32 family) +{ + return strcmp(match->name, name) == 0 && match->revision == rev && + (match->family == NFPROTO_UNSPEC || match->family == family); +} + static const struct nft_expr_ops * nft_match_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -626,7 +633,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, struct nft_xt *nft_match; struct xt_match *match; char *mt_name; - __u32 rev, family; + u32 rev, family; if (tb[NFTA_MATCH_NAME] == NULL || tb[NFTA_MATCH_REV] == NULL || @@ -641,8 +648,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, list_for_each_entry(nft_match, &nft_match_list, head) { struct xt_match *match = nft_match->ops.data; - if (strcmp(match->name, mt_name) == 0 && - match->revision == rev && match->family == family) { + if (nft_match_cmp(match, mt_name, rev, family)) { if (!try_module_get(match->me)) return ERR_PTR(-ENOENT); @@ -693,6 +699,13 @@ static LIST_HEAD(nft_target_list); static struct nft_expr_type nft_target_type; +static bool nft_target_cmp(const struct xt_target *tg, + const char *name, u32 rev, u32 family) +{ + return strcmp(tg->name, name) == 0 && tg->revision == rev && + (tg->family == NFPROTO_UNSPEC || tg->family == family); +} + static const struct nft_expr_ops * nft_target_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -700,7 +713,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, struct nft_xt *nft_target; struct xt_target *target; char *tg_name; - __u32 rev, family; + u32 rev, family; if (tb[NFTA_TARGET_NAME] == NULL || tb[NFTA_TARGET_REV] == NULL || @@ -715,8 +728,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, list_for_each_entry(nft_target, &nft_target_list, head) { struct xt_target *target = nft_target->ops.data; - if (strcmp(target->name, tg_name) == 0 && - target->revision == rev && target->family == family) { + if (nft_target_cmp(target, tg_name, rev, family)) { if (!try_module_get(target->me)) return ERR_PTR(-ENOENT); From a3c119d392d7d7c68865fe76f5732ca9b8164d68 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:05 -0700 Subject: [PATCH 011/116] ipv6: Refactor common ip6gre_tunnel_init codes It is a prep work to fix the dst_entry refcnt bugs in ip6_tunnel. This patch refactors some common init codes used by both ip6gre_tunnel_init and ip6gre_tap_init. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4038c694ec03..af60d46129c1 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1245,7 +1245,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev) netif_keep_dst(dev); } -static int ip6gre_tunnel_init(struct net_device *dev) +static int ip6gre_tunnel_init_common(struct net_device *dev) { struct ip6_tnl *tunnel; @@ -1255,16 +1255,30 @@ static int ip6gre_tunnel_init(struct net_device *dev) tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + return 0; +} + +static int ip6gre_tunnel_init(struct net_device *dev) +{ + struct ip6_tnl *tunnel; + int ret; + + ret = ip6gre_tunnel_init_common(dev); + if (ret) + return ret; + + tunnel = netdev_priv(dev); + memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr)); if (ipv6_addr_any(&tunnel->parms.raddr)) dev->header_ops = &ip6gre_header_ops; - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - return 0; } @@ -1460,19 +1474,16 @@ static void ip6gre_netlink_parms(struct nlattr *data[], static int ip6gre_tap_init(struct net_device *dev) { struct ip6_tnl *tunnel; + int ret; + + ret = ip6gre_tunnel_init_common(dev); + if (ret) + return ret; tunnel = netdev_priv(dev); - tunnel->dev = dev; - tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); - ip6gre_tnl_link_config(tunnel, 1); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - return 0; } From f230d1e891ba1da5953460516960894154f265db Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:06 -0700 Subject: [PATCH 012/116] ipv6: Rename the dst_cache helper functions in ip6_tunnel It is a prep work to fix the dst_entry refcnt bugs in ip6_tunnel. This patch rename: 1. ip6_tnl_dst_check() to ip6_tnl_dst_get() to better reflect that it will take a dst refcnt in the next patch. 2. ip6_tnl_dst_store() to ip6_tnl_dst_set() to have a more conventional name matching with ip6_tnl_dst_get(). Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 4 ++-- net/ipv6/ip6_gre.c | 4 ++-- net/ipv6/ip6_tunnel.c | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index b8529aa1dae7..979b081a47e8 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -60,9 +60,9 @@ struct ipv6_tlv_tnl_enc_lim { __u8 encap_limit; /* tunnel encapsulation limit */ } __packed; -struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t); +struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t); void ip6_tnl_dst_reset(struct ip6_tnl *t); -void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst); +void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst); int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index af60d46129c1..24f5dd8f76a8 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -634,7 +634,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_check(tunnel); + dst = ip6_tnl_dst_get(tunnel); if (!dst) { ndst = ip6_route_output(net, NULL, fl6); @@ -763,7 +763,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, ip6tunnel_xmit(NULL, skb, dev); if (ndst) - ip6_tnl_dst_store(tunnel, ndst); + ip6_tnl_dst_set(tunnel, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index b0ab420612bc..599b0b419fbc 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -126,7 +126,7 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) * Locking : hash tables are protected by RCU and RTNL */ -struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) +struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) { struct dst_entry *dst = t->dst_cache; @@ -139,7 +139,7 @@ struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) return dst; } -EXPORT_SYMBOL_GPL(ip6_tnl_dst_check); +EXPORT_SYMBOL_GPL(ip6_tnl_dst_get); void ip6_tnl_dst_reset(struct ip6_tnl *t) { @@ -148,14 +148,14 @@ void ip6_tnl_dst_reset(struct ip6_tnl *t) } EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); -void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) +void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *) dst; t->dst_cookie = rt6_get_cookie(rt); dst_release(t->dst_cache); t->dst_cache = dst; } -EXPORT_SYMBOL_GPL(ip6_tnl_dst_store); +EXPORT_SYMBOL_GPL(ip6_tnl_dst_set); /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses @@ -1010,7 +1010,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_check(t); + dst = ip6_tnl_dst_get(t); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; @@ -1102,7 +1102,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, ipv6h->daddr = fl6->daddr; ip6tunnel_xmit(NULL, skb, dev); if (ndst) - ip6_tnl_dst_store(t, ndst); + ip6_tnl_dst_set(t, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; From cdf3464e6c6bd764277cbbe992cd12da735b92fb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:07 -0700 Subject: [PATCH 013/116] ipv6: Fix dst_entry refcnt bugs in ip6_tunnel Problems in the current dst_entry cache in the ip6_tunnel: 1. ip6_tnl_dst_set is racy. There is no lock to protect it: - One major problem is that the dst refcnt gets messed up. F.e. the same dst_cache can be released multiple times and then triggering the infamous dst refcnt < 0 warning message. - Another issue is the inconsistency between dst_cache and dst_cookie. It can be reproduced by adding and removing the ip6gre tunnel while running a super_netperf TCP_CRR test. 2. ip6_tnl_dst_get does not take the dst refcnt before returning the dst. This patch: 1. Create a percpu dst_entry cache in ip6_tnl 2. Use a spinlock to protect the dst_cache operations 3. ip6_tnl_dst_get always takes the dst refcnt before returning Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 11 +++- net/ipv6/ip6_gre.c | 38 +++++++------ net/ipv6/ip6_tunnel.c | 120 +++++++++++++++++++++++++++++---------- 3 files changed, 122 insertions(+), 47 deletions(-) diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 979b081a47e8..60b4f402f78c 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -32,6 +32,12 @@ struct __ip6_tnl_parm { __be32 o_key; }; +struct ip6_tnl_dst { + spinlock_t lock; + struct dst_entry *dst; + u32 cookie; +}; + /* IPv6 tunnel */ struct ip6_tnl { struct ip6_tnl __rcu *next; /* next tunnel in list */ @@ -39,8 +45,7 @@ struct ip6_tnl { struct net *net; /* netns for packet i/o */ struct __ip6_tnl_parm parms; /* tunnel configuration parameters */ struct flowi fl; /* flowi template for xmit */ - struct dst_entry *dst_cache; /* cached dst */ - u32 dst_cookie; + struct ip6_tnl_dst __percpu *dst_cache; /* cached dst */ int err_count; unsigned long err_time; @@ -61,6 +66,8 @@ struct ipv6_tlv_tnl_enc_lim { } __packed; struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t); +int ip6_tnl_dst_init(struct ip6_tnl *t); +void ip6_tnl_dst_destroy(struct ip6_tnl *t); void ip6_tnl_dst_reset(struct ip6_tnl *t); void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst); int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 24f5dd8f76a8..646512488c28 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -637,17 +637,17 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, dst = ip6_tnl_dst_get(tunnel); if (!dst) { - ndst = ip6_route_output(net, NULL, fl6); + dst = ip6_route_output(net, NULL, fl6); - if (ndst->error) + if (dst->error) goto tx_err_link_failure; - ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); - if (IS_ERR(ndst)) { - err = PTR_ERR(ndst); - ndst = NULL; + dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; goto tx_err_link_failure; } - dst = ndst; + ndst = dst; } tdev = dst->dev; @@ -702,12 +702,9 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, skb = new_skb; } - if (fl6->flowi6_mark) { - skb_dst_set(skb, dst); - ndst = NULL; - } else { - skb_dst_set_noref(skb, dst); - } + if (!fl6->flowi6_mark && ndst) + ip6_tnl_dst_set(tunnel, ndst); + skb_dst_set(skb, dst); proto = NEXTHDR_GRE; if (encap_limit >= 0) { @@ -762,14 +759,12 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, skb_set_inner_protocol(skb, protocol); ip6tunnel_xmit(NULL, skb, dev); - if (ndst) - ip6_tnl_dst_set(tunnel, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; dst_link_failure(skb); tx_err_dst_release: - dst_release(ndst); + dst_release(dst); return err; } @@ -1223,6 +1218,9 @@ static const struct net_device_ops ip6gre_netdev_ops = { static void ip6gre_dev_free(struct net_device *dev) { + struct ip6_tnl *t = netdev_priv(dev); + + ip6_tnl_dst_destroy(t); free_percpu(dev->tstats); free_netdev(dev); } @@ -1248,6 +1246,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev) static int ip6gre_tunnel_init_common(struct net_device *dev) { struct ip6_tnl *tunnel; + int ret; tunnel = netdev_priv(dev); @@ -1259,6 +1258,13 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) if (!dev->tstats) return -ENOMEM; + ret = ip6_tnl_dst_init(tunnel); + if (ret) { + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; + } + return 0; } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 599b0b419fbc..851cf6d1eb45 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -126,37 +126,90 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) * Locking : hash tables are protected by RCU and RTNL */ +static void __ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, + struct dst_entry *dst) +{ + dst_release(idst->dst); + if (dst) { + dst_hold(dst); + idst->cookie = rt6_get_cookie((struct rt6_info *)dst); + } else { + idst->cookie = 0; + } + idst->dst = dst; +} + +static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, + struct dst_entry *dst) +{ + + spin_lock_bh(&idst->lock); + __ip6_tnl_per_cpu_dst_set(idst, dst); + spin_unlock_bh(&idst->lock); +} + struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) { - struct dst_entry *dst = t->dst_cache; + struct ip6_tnl_dst *idst; + struct dst_entry *dst; - if (dst && dst->obsolete && - !dst->ops->check(dst, t->dst_cookie)) { - t->dst_cache = NULL; - dst_release(dst); - return NULL; + idst = raw_cpu_ptr(t->dst_cache); + spin_lock_bh(&idst->lock); + dst = idst->dst; + if (dst) { + if (!dst->obsolete || dst->ops->check(dst, idst->cookie)) { + dst_hold(idst->dst); + } else { + __ip6_tnl_per_cpu_dst_set(idst, NULL); + dst = NULL; + } } - + spin_unlock_bh(&idst->lock); return dst; } EXPORT_SYMBOL_GPL(ip6_tnl_dst_get); void ip6_tnl_dst_reset(struct ip6_tnl *t) { - dst_release(t->dst_cache); - t->dst_cache = NULL; + int i; + + for_each_possible_cpu(i) + ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), NULL); } EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst) { - struct rt6_info *rt = (struct rt6_info *) dst; - t->dst_cookie = rt6_get_cookie(rt); - dst_release(t->dst_cache); - t->dst_cache = dst; + ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), dst); + } EXPORT_SYMBOL_GPL(ip6_tnl_dst_set); +void ip6_tnl_dst_destroy(struct ip6_tnl *t) +{ + if (!t->dst_cache) + return; + + ip6_tnl_dst_reset(t); + free_percpu(t->dst_cache); +} +EXPORT_SYMBOL_GPL(ip6_tnl_dst_destroy); + +int ip6_tnl_dst_init(struct ip6_tnl *t) +{ + int i; + + t->dst_cache = alloc_percpu(struct ip6_tnl_dst); + if (!t->dst_cache) + return -ENOMEM; + + for_each_possible_cpu(i) + spin_lock_init(&per_cpu_ptr(t->dst_cache, i)->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(ip6_tnl_dst_init); + /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses * @remote: the address of the tunnel exit-point @@ -271,6 +324,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) static void ip6_dev_free(struct net_device *dev) { + struct ip6_tnl *t = netdev_priv(dev); + + ip6_tnl_dst_destroy(t); free_percpu(dev->tstats); free_netdev(dev); } @@ -1016,17 +1072,17 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, goto tx_err_link_failure; if (!dst) { - ndst = ip6_route_output(net, NULL, fl6); + dst = ip6_route_output(net, NULL, fl6); - if (ndst->error) + if (dst->error) goto tx_err_link_failure; - ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); - if (IS_ERR(ndst)) { - err = PTR_ERR(ndst); - ndst = NULL; + dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; goto tx_err_link_failure; } - dst = ndst; + ndst = dst; } tdev = dst->dev; @@ -1072,12 +1128,11 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, consume_skb(skb); skb = new_skb; } - if (fl6->flowi6_mark) { - skb_dst_set(skb, dst); - ndst = NULL; - } else { - skb_dst_set_noref(skb, dst); - } + + if (!fl6->flowi6_mark && ndst) + ip6_tnl_dst_set(t, ndst); + skb_dst_set(skb, dst); + skb->transport_header = skb->network_header; proto = fl6->flowi6_proto; @@ -1101,14 +1156,12 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; ip6tunnel_xmit(NULL, skb, dev); - if (ndst) - ip6_tnl_dst_set(t, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; dst_link_failure(skb); tx_err_dst_release: - dst_release(ndst); + dst_release(dst); return err; } @@ -1573,12 +1626,21 @@ static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); + int ret; t->dev = dev; t->net = dev_net(dev); dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; + + ret = ip6_tnl_dst_init(t); + if (ret) { + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; + } + return 0; } From 8e3d5be7368107f0c27a1f8126d79b01a47e9567 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:08 -0700 Subject: [PATCH 014/116] ipv6: Avoid double dst_free It is a prep work to get dst freeing from fib tree undergo a rcu grace period. The following is a common paradigm: if (ip6_del_rt(rt)) dst_free(rt) which means, if rt cannot be deleted from the fib tree, dst_free(rt) now. 1. We don't know the ip6_del_rt(rt) failure is because it was not managed by fib tree (e.g. DST_NOCACHE) or it had already been removed from the fib tree. 2. If rt had been managed by the fib tree, ip6_del_rt(rt) failure means dst_free(rt) has been called already. A second dst_free(rt) is not always obviously safe. The rt may have been destroyed already. 3. If rt is a DST_NOCACHE, dst_free(rt) should not be called. 4. It is a stopper to make dst freeing from fib tree undergo a rcu grace period. This patch is to use a DST_NOCACHE flag to indicate a rt is not managed by the fib tree. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 7 +++---- net/ipv6/ip6_fib.c | 11 +++++++++-- net/ipv6/route.c | 7 ++++--- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 030fefdc9aed..900113376d4e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5127,13 +5127,12 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) rt = addrconf_get_prefix_route(&ifp->peer_addr, 128, ifp->idev->dev, 0, 0); - if (rt && ip6_del_rt(rt)) - dst_free(&rt->dst); + if (rt) + ip6_del_rt(rt); } dst_hold(&ifp->rt->dst); - if (ip6_del_rt(ifp->rt)) - dst_free(&ifp->rt->dst); + ip6_del_rt(ifp->rt); rt_genid_bump_ipv6(net); break; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 418d9823692b..e68350bf838b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -933,6 +933,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, int replace_required = 0; int sernum = fib6_new_sernum(info->nl_net); + if (WARN_ON_ONCE((rt->dst.flags & DST_NOCACHE) && + !atomic_read(&rt->dst.__refcnt))) + return -EINVAL; + if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; @@ -1025,6 +1029,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, fib6_start_gc(info->nl_net, rt); if (!(rt->rt6i_flags & RTF_CACHE)) fib6_prune_clones(info->nl_net, pn); + rt->dst.flags &= ~DST_NOCACHE; } out: @@ -1049,7 +1054,8 @@ out: atomic_inc(&pn->leaf->rt6i_ref); } #endif - dst_free(&rt->dst); + if (!(rt->dst.flags & DST_NOCACHE)) + dst_free(&rt->dst); } return err; @@ -1060,7 +1066,8 @@ out: st_failure: if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) fib6_repair_tree(info->nl_net, fn); - dst_free(&rt->dst); + if (!(rt->dst.flags & DST_NOCACHE)) + dst_free(&rt->dst); return err; #endif } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 53617d715188..3d3c1b294725 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1322,8 +1322,7 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt) { if (rt->rt6i_flags & RTF_CACHE) { dst_hold(&rt->dst); - if (ip6_del_rt(rt)) - dst_free(&rt->dst); + ip6_del_rt(rt); } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { rt->rt6i_node->fn_sernum = -1; } @@ -2028,7 +2027,8 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) struct fib6_table *table; struct net *net = dev_net(rt->dst.dev); - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.ip6_null_entry || + rt->dst.flags & DST_NOCACHE) { err = -ENOENT; goto out; } @@ -2515,6 +2515,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL); + rt->dst.flags |= DST_NOCACHE; atomic_set(&rt->dst.__refcnt, 1); From 70da5b5c532f0ec8aa76b4f46158da5f010f34b3 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:09 -0700 Subject: [PATCH 015/116] ipv6: Replace spinlock with seqlock and rcu in ip6_tunnel This patch uses a seqlock to ensure consistency between idst->dst and idst->cookie. It also makes dst freeing from fib tree to undergo a rcu grace period. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 4 ++-- net/ipv6/ip6_fib.c | 9 +++++-- net/ipv6/ip6_tunnel.c | 51 +++++++++++++++++++++------------------- 3 files changed, 36 insertions(+), 28 deletions(-) diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 60b4f402f78c..65c2a9397b3c 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -33,8 +33,8 @@ struct __ip6_tnl_parm { }; struct ip6_tnl_dst { - spinlock_t lock; - struct dst_entry *dst; + seqlock_t lock; + struct dst_entry __rcu *dst; u32 cookie; }; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e68350bf838b..8a9ec01f4d01 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -155,6 +155,11 @@ static void node_free(struct fib6_node *fn) kmem_cache_free(fib6_node_kmem, fn); } +static void rt6_rcu_free(struct rt6_info *rt) +{ + call_rcu(&rt->dst.rcu_head, dst_rcu_free); +} + static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) { int cpu; @@ -169,7 +174,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; if (pcpu_rt) { - dst_free(&pcpu_rt->dst); + rt6_rcu_free(pcpu_rt); *ppcpu_rt = NULL; } } @@ -181,7 +186,7 @@ static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) { rt6_free_pcpu(rt); - dst_free(&rt->dst); + rt6_rcu_free(rt); } } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 851cf6d1eb45..983f0d20f96d 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -126,45 +126,48 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) * Locking : hash tables are protected by RCU and RTNL */ -static void __ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, - struct dst_entry *dst) +static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, + struct dst_entry *dst) { - dst_release(idst->dst); + write_seqlock_bh(&idst->lock); + dst_release(rcu_dereference_protected( + idst->dst, + lockdep_is_held(&idst->lock.lock))); if (dst) { dst_hold(dst); idst->cookie = rt6_get_cookie((struct rt6_info *)dst); } else { idst->cookie = 0; } - idst->dst = dst; -} - -static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, - struct dst_entry *dst) -{ - - spin_lock_bh(&idst->lock); - __ip6_tnl_per_cpu_dst_set(idst, dst); - spin_unlock_bh(&idst->lock); + rcu_assign_pointer(idst->dst, dst); + write_sequnlock_bh(&idst->lock); } struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) { struct ip6_tnl_dst *idst; struct dst_entry *dst; + unsigned int seq; + u32 cookie; idst = raw_cpu_ptr(t->dst_cache); - spin_lock_bh(&idst->lock); - dst = idst->dst; - if (dst) { - if (!dst->obsolete || dst->ops->check(dst, idst->cookie)) { - dst_hold(idst->dst); - } else { - __ip6_tnl_per_cpu_dst_set(idst, NULL); - dst = NULL; - } + + rcu_read_lock(); + do { + seq = read_seqbegin(&idst->lock); + dst = rcu_dereference(idst->dst); + cookie = idst->cookie; + } while (read_seqretry(&idst->lock, seq)); + + if (dst && !atomic_inc_not_zero(&dst->__refcnt)) + dst = NULL; + rcu_read_unlock(); + + if (dst && dst->obsolete && !dst->ops->check(dst, cookie)) { + ip6_tnl_per_cpu_dst_set(idst, NULL); + dst_release(dst); + dst = NULL; } - spin_unlock_bh(&idst->lock); return dst; } EXPORT_SYMBOL_GPL(ip6_tnl_dst_get); @@ -204,7 +207,7 @@ int ip6_tnl_dst_init(struct ip6_tnl *t) return -ENOMEM; for_each_possible_cpu(i) - spin_lock_init(&per_cpu_ptr(t->dst_cache, i)->lock); + seqlock_init(&per_cpu_ptr(t->dst_cache, i)->lock); return 0; } From daf158d0d544cec80b7b30deff8cfc59a6e17610 Mon Sep 17 00:00:00 2001 From: Simon Guinot Date: Tue, 15 Sep 2015 22:41:21 +0200 Subject: [PATCH 016/116] net: mvneta: fix DMA buffer unmapping in mvneta_rx() This patch fixes a regression introduced by the commit a84e32894191 ("net: mvneta: fix refilling for Rx DMA buffers"). Due to this commit the newly allocated Rx buffers are DMA-unmapped in place of those passed to the networking stack. Obviously, this causes data corruptions. This patch fixes the issue by ensuring that the right Rx buffers are DMA-unmapped. Reported-by: Oren Laskin Signed-off-by: Simon Guinot Fixes: a84e32894191 ("net: mvneta: fix refilling for Rx DMA buffers") Cc: # v3.8+ Tested-by: Oren Laskin Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index fe2299ac4f5c..09ec32e33076 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -1479,6 +1479,7 @@ static int mvneta_rx(struct mvneta_port *pp, int rx_todo, struct mvneta_rx_desc *rx_desc = mvneta_rxq_next_desc_get(rxq); struct sk_buff *skb; unsigned char *data; + dma_addr_t phys_addr; u32 rx_status; int rx_bytes, err; @@ -1486,6 +1487,7 @@ static int mvneta_rx(struct mvneta_port *pp, int rx_todo, rx_status = rx_desc->status; rx_bytes = rx_desc->data_size - (ETH_FCS_LEN + MVNETA_MH_SIZE); data = (unsigned char *)rx_desc->buf_cookie; + phys_addr = rx_desc->buf_phys_addr; if (!mvneta_rxq_desc_is_first_last(rx_status) || (rx_status & MVNETA_RXD_ERR_SUMMARY)) { @@ -1534,7 +1536,7 @@ static int mvneta_rx(struct mvneta_port *pp, int rx_todo, if (!skb) goto err_drop_frame; - dma_unmap_single(dev->dev.parent, rx_desc->buf_phys_addr, + dma_unmap_single(dev->dev.parent, phys_addr, MVNETA_RX_BUF_SIZE(pp->pkt_size), DMA_FROM_DEVICE); rcvd_pkts++; From d64f69b0373a7d0bcec8b5da7712977518a8f42b Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 15 Sep 2015 14:44:29 -0700 Subject: [PATCH 017/116] rtnetlink: catch -EOPNOTSUPP errors from ndo_bridge_getlink problem reported: kernel 4.1.3 ------------ # bridge vlan port vlan ids eth0 1 PVID Egress Untagged 90 91 92 93 94 95 96 97 98 99 100 vmbr0 1 PVID Egress Untagged 94 kernel 4.2 ----------- # bridge vlan port vlan ids ndo_bridge_getlink can return -EOPNOTSUPP when an interfaces ndo_bridge_getlink op is set to switchdev_port_bridge_getlink and CONFIG_SWITCHDEV is not defined. This today can happen to bond, rocker and team devices. This patch adds -EOPNOTSUPP checks after calls to ndo_bridge_getlink. Fixes: 85fdb956726ff2a ("switchdev: cut over to new switchdev_port_bridge_getlink") Reported-by: Alexandre DERUMIER Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a466821d1441..0ec48403ed68 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3047,6 +3047,7 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) u32 portid = NETLINK_CB(cb->skb).portid; u32 seq = cb->nlh->nlmsg_seq; u32 filter_mask = 0; + int err; if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) { struct nlattr *extfilt; @@ -3067,20 +3068,25 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { - if (idx >= cb->args[0] && - br_dev->netdev_ops->ndo_bridge_getlink( - skb, portid, seq, dev, filter_mask, - NLM_F_MULTI) < 0) - break; + if (idx >= cb->args[0]) { + err = br_dev->netdev_ops->ndo_bridge_getlink( + skb, portid, seq, dev, + filter_mask, NLM_F_MULTI); + if (err < 0 && err != -EOPNOTSUPP) + break; + } idx++; } if (ops->ndo_bridge_getlink) { - if (idx >= cb->args[0] && - ops->ndo_bridge_getlink(skb, portid, seq, dev, - filter_mask, - NLM_F_MULTI) < 0) - break; + if (idx >= cb->args[0]) { + err = ops->ndo_bridge_getlink(skb, portid, + seq, dev, + filter_mask, + NLM_F_MULTI); + if (err < 0 && err != -EOPNOTSUPP) + break; + } idx++; } } From 892aa01df2ad67237f213c8f9d9b491e908aa910 Mon Sep 17 00:00:00 2001 From: Sjoerd Simons Date: Fri, 11 Sep 2015 22:25:48 +0200 Subject: [PATCH 018/116] net: stmmac: Use msleep rather then udelay for reset delay The reset delays used for stmmac are in the order of 10ms to 1 second, which is far too long for udelay usage, so switch to using msleep. Practically this fixes the PHY not being reliably detected in some cases as udelay wouldn't actually delay for long enough to let the phy reliably be reset. Signed-off-by: Sjoerd Simons Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c index b735fa22ac95..ebf6abc4853f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c @@ -161,11 +161,16 @@ int stmmac_mdio_reset(struct mii_bus *bus) if (!gpio_request(reset_gpio, "mdio-reset")) { gpio_direction_output(reset_gpio, active_low ? 1 : 0); - udelay(data->delays[0]); + if (data->delays[0]) + msleep(DIV_ROUND_UP(data->delays[0], 1000)); + gpio_set_value(reset_gpio, active_low ? 0 : 1); - udelay(data->delays[1]); + if (data->delays[1]) + msleep(DIV_ROUND_UP(data->delays[1], 1000)); + gpio_set_value(reset_gpio, active_low ? 1 : 0); - udelay(data->delays[2]); + if (data->delays[2]) + msleep(DIV_ROUND_UP(data->delays[2], 1000)); } } #endif From 982b527004826b40de1e821b123c70f05b41496c Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 11 Sep 2015 18:38:28 -0700 Subject: [PATCH 019/116] openvswitch: Fix mask generation for nested attributes. Masks were added to OVS flows in a way that was backwards compatible with userspace programs that did not generate masks. As a result, it is possible that we may receive flows that do not have a mask and we need to synthesize one. Generating a mask requires iterating over attributes and descending into nested attributes. For each level we need to know the size to generate the correct mask. We do this with a linked table of attribute types. Although the logic to handle these nested attributes was there in concept, there are a number of bugs in practice. Examples include incomplete links between tables, variable length attributes being treated as nested and missing sanity checks. Signed-off-by: Jesse Gross Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 82 ++++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 23 deletions(-) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index c92d6a262bc5..5c030a4d7338 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -57,6 +57,7 @@ struct ovs_len_tbl { }; #define OVS_ATTR_NESTED -1 +#define OVS_ATTR_VARIABLE -2 static void update_range(struct sw_flow_match *match, size_t offset, size_t size, bool is_mask) @@ -304,6 +305,10 @@ size_t ovs_key_attr_size(void) + nla_total_size(28); /* OVS_KEY_ATTR_ND */ } +static const struct ovs_len_tbl ovs_vxlan_ext_key_lens[OVS_VXLAN_EXT_MAX + 1] = { + [OVS_VXLAN_EXT_GBP] = { .len = sizeof(u32) }, +}; + static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { [OVS_TUNNEL_KEY_ATTR_ID] = { .len = sizeof(u64) }, [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = { .len = sizeof(u32) }, @@ -315,8 +320,9 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] [OVS_TUNNEL_KEY_ATTR_TP_SRC] = { .len = sizeof(u16) }, [OVS_TUNNEL_KEY_ATTR_TP_DST] = { .len = sizeof(u16) }, [OVS_TUNNEL_KEY_ATTR_OAM] = { .len = 0 }, - [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_NESTED }, - [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED }, + [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_VARIABLE }, + [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED, + .next = ovs_vxlan_ext_key_lens }, }; /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ @@ -349,6 +355,13 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_CT_LABEL] = { .len = sizeof(struct ovs_key_ct_label) }, }; +static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) +{ + return expected_len == attr_len || + expected_len == OVS_ATTR_NESTED || + expected_len == OVS_ATTR_VARIABLE; +} + static bool is_all_zero(const u8 *fp, size_t size) { int i; @@ -388,7 +401,7 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, } expected_len = ovs_key_lens[type].len; - if (nla_len(nla) != expected_len && expected_len != OVS_ATTR_NESTED) { + if (!check_attr_len(nla_len(nla), expected_len)) { OVS_NLERR(log, "Key %d has unexpected len %d expected %d", type, nla_len(nla), expected_len); return -EINVAL; @@ -473,29 +486,50 @@ static int genev_tun_opt_from_nlattr(const struct nlattr *a, return 0; } -static const struct nla_policy vxlan_opt_policy[OVS_VXLAN_EXT_MAX + 1] = { - [OVS_VXLAN_EXT_GBP] = { .type = NLA_U32 }, -}; - -static int vxlan_tun_opt_from_nlattr(const struct nlattr *a, +static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, bool log) { - struct nlattr *tb[OVS_VXLAN_EXT_MAX+1]; + struct nlattr *a; + int rem; unsigned long opt_key_offset; struct vxlan_metadata opts; - int err; BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); - err = nla_parse_nested(tb, OVS_VXLAN_EXT_MAX, a, vxlan_opt_policy); - if (err < 0) - return err; - memset(&opts, 0, sizeof(opts)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); - if (tb[OVS_VXLAN_EXT_GBP]) - opts.gbp = nla_get_u32(tb[OVS_VXLAN_EXT_GBP]); + if (type > OVS_VXLAN_EXT_MAX) { + OVS_NLERR(log, "VXLAN extension %d out of range max %d", + type, OVS_VXLAN_EXT_MAX); + return -EINVAL; + } + + if (!check_attr_len(nla_len(a), + ovs_vxlan_ext_key_lens[type].len)) { + OVS_NLERR(log, "VXLAN extension %d has unexpected len %d expected %d", + type, nla_len(a), + ovs_vxlan_ext_key_lens[type].len); + return -EINVAL; + } + + switch (type) { + case OVS_VXLAN_EXT_GBP: + opts.gbp = nla_get_u32(a); + break; + default: + OVS_NLERR(log, "Unknown VXLAN extension attribute %d", + type); + return -EINVAL; + } + } + if (rem) { + OVS_NLERR(log, "VXLAN extension message has %d unknown bytes.", + rem); + return -EINVAL; + } if (!is_mask) SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false); @@ -528,8 +562,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, return -EINVAL; } - if (ovs_tunnel_key_lens[type].len != nla_len(a) && - ovs_tunnel_key_lens[type].len != OVS_ATTR_NESTED) { + if (!check_attr_len(nla_len(a), + ovs_tunnel_key_lens[type].len)) { OVS_NLERR(log, "Tunnel attr %d has unexpected len %d expected %d", type, nla_len(a), ovs_tunnel_key_lens[type].len); return -EINVAL; @@ -1052,10 +1086,13 @@ static void nlattr_set(struct nlattr *attr, u8 val, /* The nlattr stream should already have been validated */ nla_for_each_nested(nla, attr, rem) { - if (tbl && tbl[nla_type(nla)].len == OVS_ATTR_NESTED) - nlattr_set(nla, val, tbl[nla_type(nla)].next); - else + if (tbl[nla_type(nla)].len == OVS_ATTR_NESTED) { + if (tbl[nla_type(nla)].next) + tbl = tbl[nla_type(nla)].next; + nlattr_set(nla, val, tbl); + } else { memset(nla_data(nla), val, nla_len(nla)); + } } } @@ -1922,8 +1959,7 @@ static int validate_set(const struct nlattr *a, key_len /= 2; if (key_type > OVS_KEY_ATTR_MAX || - (ovs_key_lens[key_type].len != key_len && - ovs_key_lens[key_type].len != OVS_ATTR_NESTED)) + !check_attr_len(key_len, ovs_key_lens[key_type].len)) return -EINVAL; if (masked && !validate_masked(nla_data(ovs_key), key_len)) From 58d29e3ce903dcafacee9e355225d64922325cf0 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 13 Sep 2015 14:15:03 +0200 Subject: [PATCH 020/116] atm: he: drop null test before destroy functions Remove unneeded NULL test. The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ expression x; @@ -if (x != NULL) \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x); // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- drivers/atm/he.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/atm/he.c b/drivers/atm/he.c index a8da3a50e374..0f5cb37636bc 100644 --- a/drivers/atm/he.c +++ b/drivers/atm/he.c @@ -1578,9 +1578,7 @@ he_stop(struct he_dev *he_dev) kfree(he_dev->rbpl_virt); kfree(he_dev->rbpl_table); - - if (he_dev->rbpl_pool) - dma_pool_destroy(he_dev->rbpl_pool); + dma_pool_destroy(he_dev->rbpl_pool); if (he_dev->rbrq_base) dma_free_coherent(&he_dev->pci_dev->dev, CONFIG_RBRQ_SIZE * sizeof(struct he_rbrq), @@ -1594,8 +1592,7 @@ he_stop(struct he_dev *he_dev) dma_free_coherent(&he_dev->pci_dev->dev, CONFIG_TBRQ_SIZE * sizeof(struct he_tbrq), he_dev->tpdrq_base, he_dev->tpdrq_phys); - if (he_dev->tpd_pool) - dma_pool_destroy(he_dev->tpd_pool); + dma_pool_destroy(he_dev->tpd_pool); if (he_dev->pci_dev) { pci_read_config_word(he_dev->pci_dev, PCI_COMMAND, &command); From adf78edac09f9640cd9676571586c4be46fb527c Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 13 Sep 2015 14:15:18 +0200 Subject: [PATCH 021/116] net: core: drop null test before destroy functions Remove unneeded NULL test. The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ expression x; @@ -if (x != NULL) { \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x); x = NULL; -} // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/core/sock.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index ca2984afe16e..3307c02244d3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2740,10 +2740,8 @@ static void req_prot_cleanup(struct request_sock_ops *rsk_prot) return; kfree(rsk_prot->slab_name); rsk_prot->slab_name = NULL; - if (rsk_prot->slab) { - kmem_cache_destroy(rsk_prot->slab); - rsk_prot->slab = NULL; - } + kmem_cache_destroy(rsk_prot->slab); + rsk_prot->slab = NULL; } static int req_prot_init(const struct proto *prot) @@ -2828,10 +2826,8 @@ void proto_unregister(struct proto *prot) list_del(&prot->node); mutex_unlock(&proto_list_mutex); - if (prot->slab != NULL) { - kmem_cache_destroy(prot->slab); - prot->slab = NULL; - } + kmem_cache_destroy(prot->slab); + prot->slab = NULL; req_prot_cleanup(prot->rsk_prot); From 20471ed4d403a5f4de6aa0c10cd1e446f7f2b3c7 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 13 Sep 2015 14:15:27 +0200 Subject: [PATCH 022/116] dccp: drop null test before destroy functions Remove unneeded NULL test. The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ expression x; @@ -if (x != NULL) \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x); @@ expression x; @@ -if (x != NULL) { \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x); x = NULL; -} // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/dccp/ackvec.c | 12 ++++-------- net/dccp/ccid.c | 3 +-- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index bd9e718c2a20..3de0d0362d7f 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -398,12 +398,8 @@ out_err: void dccp_ackvec_exit(void) { - if (dccp_ackvec_slab != NULL) { - kmem_cache_destroy(dccp_ackvec_slab); - dccp_ackvec_slab = NULL; - } - if (dccp_ackvec_record_slab != NULL) { - kmem_cache_destroy(dccp_ackvec_record_slab); - dccp_ackvec_record_slab = NULL; - } + kmem_cache_destroy(dccp_ackvec_slab); + dccp_ackvec_slab = NULL; + kmem_cache_destroy(dccp_ackvec_record_slab); + dccp_ackvec_record_slab = NULL; } diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index 83498975165f..90f77d08cc37 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -95,8 +95,7 @@ static struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_f static void ccid_kmem_cache_destroy(struct kmem_cache *slab) { - if (slab != NULL) - kmem_cache_destroy(slab); + kmem_cache_destroy(slab); } static int __init ccid_activate(struct ccid_operations *ccid_ops) From d8949aad3eab5d396f4fefcd581773bf07b9a79e Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 4 Sep 2015 12:22:46 +0300 Subject: [PATCH 023/116] Bluetooth: Delay check for conn->smp in smp_conn_security() There are several actions that smp_conn_security() might make that do not require a valid SMP context (conn->smp pointer). One of these actions is to encrypt the link with an existing LTK. If the SMP context wasn't initialized properly we should still allow the independent actions to be done, i.e. the check for the context should only be done at the last possible moment. Reported-by: Chuck Ebbert Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org # 4.0+ --- net/bluetooth/smp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index ad82324f710f..0510a577a7b5 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2311,12 +2311,6 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) if (!conn) return 1; - chan = conn->smp; - if (!chan) { - BT_ERR("SMP security requested but not available"); - return 1; - } - if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED)) return 1; @@ -2330,6 +2324,12 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; + chan = conn->smp; + if (!chan) { + BT_ERR("SMP security requested but not available"); + return 1; + } + l2cap_chan_lock(chan); /* If SMP is already in progress ignore this request */ From ad5001cc7cdf9aaee5eb213fdee657e4a3c94776 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 17 Sep 2015 13:37:00 +0200 Subject: [PATCH 024/116] netfilter: nf_log: wait for rcu grace after logger unregistration The nf_log_unregister() function needs to call synchronize_rcu() to make sure that the objects are not dereferenced anymore on module removal. Fixes: 5962815a6a56 ("netfilter: nf_log: use an array of loggers instead of list") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_log.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index a5ebd7d9c472..a5d41dfa9f05 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -117,6 +117,7 @@ void nf_log_unregister(struct nf_logger *logger) RCU_INIT_POINTER(loggers[i][logger->type], NULL); } mutex_unlock(&nf_log_mutex); + synchronize_rcu(); } EXPORT_SYMBOL(nf_log_unregister); From 37a1d3611c126fd9782ce5235791f898f053e763 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Sun, 13 Sep 2015 10:18:33 -0700 Subject: [PATCH 025/116] ipv6: include NLM_F_REPLACE in route replace notifications This patch adds NLM_F_REPLACE flag to ipv6 route replace notifications. This makes nlm_flags in ipv6 replace notifications consistent with ipv4. Signed-off-by: Roopa Prabhu Acked-by: Nicolas Dichtel Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 ++- net/ipv6/ip6_fib.c | 6 +++--- net/ipv6/route.c | 5 +++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 063d30474cf6..aaf9700fc9e5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -275,7 +275,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, struct mx6_config *mxc); int fib6_del(struct rt6_info *rt, struct nl_info *info); -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info); +void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, + unsigned int flags); void fib6_run_gc(unsigned long expires, struct net *net, bool force); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 8a9ec01f4d01..7d2e0023c72d 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -851,7 +851,7 @@ add: *ins = rt; rt->rt6i_node = fn; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info); + inet6_rt_notify(RTM_NEWROUTE, rt, info, 0); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { @@ -877,7 +877,7 @@ add: rt->rt6i_node = fn; rt->dst.rt6_next = iter->dst.rt6_next; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info); + inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; @@ -1422,7 +1422,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, fib6_purge_rt(rt, fn, net); - inet6_rt_notify(RTM_DELROUTE, rt, info); + inet6_rt_notify(RTM_DELROUTE, rt, info, 0); rt6_release(rt); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3d3c1b294725..d5fa50297f80 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3304,7 +3304,8 @@ errout: return err; } -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) +void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, + unsigned int nlm_flags) { struct sk_buff *skb; struct net *net = info->nl_net; @@ -3319,7 +3320,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, - event, info->portid, seq, 0, 0, 0); + event, info->portid, seq, 0, 0, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); From cc5706056baa3002b844ff240a1cc2199a978795 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Mon, 14 Sep 2015 11:14:50 -0700 Subject: [PATCH 026/116] openvswitch: Fix IPv6 exthdr handling with ct helpers. Static code analysis reveals the following bug: net/openvswitch/conntrack.c:281 ovs_ct_helper() warn: unsigned 'protoff' is never less than zero. This signedness bug breaks error handling for IPv6 extension headers when using conntrack helpers. Fix the error by using a local signed variable. Fixes: cae3a2627520: "openvswitch: Allow attaching helpers to ct action" Reported-by: Dan Carpenter Signed-off-by: Joe Stringer Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e8e524ad8a01..002a755fa07e 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -275,13 +275,15 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto) case NFPROTO_IPV6: { u8 nexthdr = ipv6_hdr(skb)->nexthdr; __be16 frag_off; + int ofs; - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), - &nexthdr, &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { + ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { pr_debug("proto header not found\n"); return NF_ACCEPT; } + protoff = ofs; break; } default: From 562d897d15a6e2bab3cc9b4c172286b612834fe8 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 15 Sep 2015 10:50:14 -0600 Subject: [PATCH 027/116] net: Add documentation for VRF device Signed-off-by: David Ahern Signed-off-by: David S. Miller --- Documentation/networking/vrf.txt | 96 ++++++++++++++++++++++++++++++++ MAINTAINERS | 1 + 2 files changed, 97 insertions(+) create mode 100644 Documentation/networking/vrf.txt diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt new file mode 100644 index 000000000000..031ef4a63485 --- /dev/null +++ b/Documentation/networking/vrf.txt @@ -0,0 +1,96 @@ +Virtual Routing and Forwarding (VRF) +==================================== +The VRF device combined with ip rules provides the ability to create virtual +routing and forwarding domains (aka VRFs, VRF-lite to be specific) in the +Linux network stack. One use case is the multi-tenancy problem where each +tenant has their own unique routing tables and in the very least need +different default gateways. + +Processes can be "VRF aware" by binding a socket to the VRF device. Packets +through the socket then use the routing table associated with the VRF +device. An important feature of the VRF device implementation is that it +impacts only Layer 3 and above so L2 tools (e.g., LLDP) are not affected +(ie., they do not need to be run in each VRF). The design also allows +the use of higher priority ip rules (Policy Based Routing, PBR) to take +precedence over the VRF device rules directing specific traffic as desired. + +In addition, VRF devices allow VRFs to be nested within namespaces. For +example network namespaces provide separation of network interfaces at L1 +(Layer 1 separation), VLANs on the interfaces within a namespace provide +L2 separation and then VRF devices provide L3 separation. + +Design +------ +A VRF device is created with an associated route table. Network interfaces +are then enslaved to a VRF device: + + +-----------------------------+ + | vrf-blue | ===> route table 10 + +-----------------------------+ + | | | + +------+ +------+ +-------------+ + | eth1 | | eth2 | ... | bond1 | + +------+ +------+ +-------------+ + | | + +------+ +------+ + | eth8 | | eth9 | + +------+ +------+ + +Packets received on an enslaved device and are switched to the VRF device +using an rx_handler which gives the impression that packets flow through +the VRF device. Similarly on egress routing rules are used to send packets +to the VRF device driver before getting sent out the actual interface. This +allows tcpdump on a VRF device to capture all packets into and out of the +VRF as a whole.[1] Similiarly, netfilter [2] and tc rules can be applied +using the VRF device to specify rules that apply to the VRF domain as a whole. + +[1] Packets in the forwarded state do not flow through the device, so those + packets are not seen by tcpdump. Will revisit this limitation in a + future release. + +[2] Iptables on ingress is limited to NF_INET_PRE_ROUTING only with skb->dev + set to real ingress device and egress is limited to NF_INET_POST_ROUTING. + Will revisit this limitation in a future release. + + +Setup +----- +1. VRF device is created with an association to a FIB table. + e.g, ip link add vrf-blue type vrf table 10 + ip link set dev vrf-blue up + +2. Rules are added that send lookups to the associated FIB table when the + iif or oif is the VRF device. e.g., + ip ru add oif vrf-blue table 10 + ip ru add iif vrf-blue table 10 + + Set the default route for the table (and hence default route for the VRF). + e.g, ip route add table 10 prohibit default + +3. Enslave L3 interfaces to a VRF device. + e.g, ip link set dev eth1 master vrf-blue + + Local and connected routes for enslaved devices are automatically moved to + the table associated with VRF device. Any additional routes depending on + the enslaved device will need to be reinserted following the enslavement. + +4. Additional VRF routes are added to associated table. + e.g., ip route add table 10 ... + + +Applications +------------ +Applications that are to work within a VRF need to bind their socket to the +VRF device: + + setsockopt(sd, SOL_SOCKET, SO_BINDTODEVICE, dev, strlen(dev)+1); + +or to specify the output device using cmsg and IP_PKTINFO. + + +Limitations +----------- +VRF device currently only works for IPv4. Support for IPv6 is under development. + +Index of original ingress interface is not available via cmsg. Will address +soon. diff --git a/MAINTAINERS b/MAINTAINERS index 310da4295c70..d4d9d4f6d271 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11243,6 +11243,7 @@ L: netdev@vger.kernel.org S: Maintained F: drivers/net/vrf.c F: include/net/vrf.h +F: Documentation/networking/vrf.txt VT1211 HARDWARE MONITOR DRIVER M: Juerg Haefliger From 2e64126bb0fb581b55e52e51ec451ebb0d6769c8 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 15 Sep 2015 10:33:07 +0200 Subject: [PATCH 028/116] net: qdisc: enhance default_qdisc documentation Aside from some lingual cleanup, point out which interfaces are not or partly covered by this setting. Signed-off-by: Phil Sutter Acked-by: Cong Wang Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 6294b5186ae5..809ab6efcc74 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -54,13 +54,15 @@ default_qdisc -------------- The default queuing discipline to use for network devices. This allows -overriding the default queue discipline of pfifo_fast with an -alternative. Since the default queuing discipline is created with the -no additional parameters so is best suited to queuing disciplines that -work well without configuration like stochastic fair queue (sfq), -CoDel (codel) or fair queue CoDel (fq_codel). Don't use queuing disciplines -like Hierarchical Token Bucket or Deficit Round Robin which require setting -up classes and bandwidths. +overriding the default of pfifo_fast with an alternative. Since the default +queuing discipline is created without additional parameters so is best suited +to queuing disciplines that work well without configuration like stochastic +fair queue (sfq), CoDel (codel) or fair queue CoDel (fq_codel). Don't use +queuing disciplines like Hierarchical Token Bucket or Deficit Round Robin +which require setting up classes and bandwidths. Note that physical multiqueue +interfaces still use mq as root qdisc, which in turn uses this default for its +leaves. Virtual devices (like e.g. lo or veth) ignore this setting and instead +default to noqueue. Default: pfifo_fast busy_read From d828755eae637c6ca39e30702e98abaf34cac146 Mon Sep 17 00:00:00 2001 From: Hariprasad Shenai Date: Tue, 15 Sep 2015 17:20:09 +0530 Subject: [PATCH 029/116] cxgb4: add device ID for few T5 adapters Signed-off-by: Hariprasad Shenai Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h index 8353a6cbfcc2..03ed00c49823 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h @@ -157,6 +157,11 @@ CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN CH_PCI_ID_TABLE_FENTRY(0x5090), /* Custom T540-CR */ CH_PCI_ID_TABLE_FENTRY(0x5091), /* Custom T522-CR */ CH_PCI_ID_TABLE_FENTRY(0x5092), /* Custom T520-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5093), /* Custom T580-LP-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5094), /* Custom T540-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5095), /* Custom T540-CR-SO */ + CH_PCI_ID_TABLE_FENTRY(0x5096), /* Custom T580-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5097), /* Custom T520-KR */ /* T6 adapters: */ From 58189ca7b27411c3dc9a5cb9eeee0906da684c59 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 15 Sep 2015 15:10:50 -0700 Subject: [PATCH 030/116] net: Fix vti use case with oif in dst lookups Steffen reported that the recent change to add oif to dst lookups breaks the VTI use case. The problem is that with the oif set in the flow struct the comparison to the nh_oif is triggered. Fix by splitting the FLOWI_FLAG_VRFSRC into 2 flags -- one that triggers the vrf device cache bypass (FLOWI_FLAG_VRFSRC) and another telling the lookup to not compare nh oif (FLOWI_FLAG_SKIP_NH_OIF). Fixes: 42a7b32b73d6 ("xfrm: Add oif to dst lookups") Signed-off-by: David Ahern Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- drivers/net/vrf.c | 3 ++- include/net/flow.h | 1 + include/net/route.h | 2 +- net/ipv4/fib_trie.c | 2 +- net/ipv4/udp.c | 3 ++- net/ipv4/xfrm4_policy.c | 2 ++ 6 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index e7094fbd7568..488c6f50df73 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -193,7 +193,8 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, .flowi4_oif = vrf_dev->ifindex, .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_tos = RT_TOS(ip4h->tos), - .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_VRFSRC, + .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_VRFSRC | + FLOWI_FLAG_SKIP_NH_OIF, .daddr = ip4h->daddr, }; diff --git a/include/net/flow.h b/include/net/flow.h index acd6a096250e..9b85db85f13c 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -35,6 +35,7 @@ struct flowi_common { #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 #define FLOWI_FLAG_VRFSRC 0x04 +#define FLOWI_FLAG_SKIP_NH_OIF 0x08 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; }; diff --git a/include/net/route.h b/include/net/route.h index cc61cb95f059..f46af256880c 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -255,7 +255,7 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 flow_flags |= FLOWI_FLAG_ANYSRC; if (netif_index_is_vrf(sock_net(sk), oif)) - flow_flags |= FLOWI_FLAG_VRFSRC; + flow_flags |= FLOWI_FLAG_VRFSRC | FLOWI_FLAG_SKIP_NH_OIF; flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, protocol, flow_flags, dst, src, dport, sport); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 26d6ffb6d23c..6c2af797f2f9 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1426,7 +1426,7 @@ found: nh->nh_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) continue; - if (!(flp->flowi4_flags & FLOWI_FLAG_VRFSRC)) { + if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) { if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) continue; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c0a15e7f359f..f7d1d5e19e95 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1024,7 +1024,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (netif_index_is_vrf(net, ipc.oif)) { flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - (flow_flags | FLOWI_FLAG_VRFSRC), + (flow_flags | FLOWI_FLAG_VRFSRC | + FLOWI_FLAG_SKIP_NH_OIF), faddr, saddr, dport, inet->inet_sport); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index bb919b28619f..c10a9ee68433 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -33,6 +33,8 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, if (saddr) fl4->saddr = saddr->a4; + fl4->flowi4_flags = FLOWI_FLAG_SKIP_NH_OIF; + rt = __ip_route_output_key(net, fl4); if (!IS_ERR(rt)) return &rt->dst; From 4671fc6d47e0a0108fe24a4d830347d6a6ef4aa7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Sep 2015 18:29:47 -0700 Subject: [PATCH 031/116] net/mlx4_en: really allow to change RSS key When changing rss key, we do not want to overwrite user provided key by the one provided by netdev_rss_key_fill(), which is the host random key generated at boot time. Fixes: 947cbb0ac242 ("net/mlx4_en: Support for configurable RSS hash function") Signed-off-by: Eric Dumazet Cc: Eyal Perry CC: Amir Vadai Acked-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 4402a1e48c9b..0ce6ffe73ca8 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -1268,8 +1268,6 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) rss_context->hash_fn = MLX4_RSS_HASH_TOP; memcpy(rss_context->rss_key, priv->rss_key, MLX4_EN_RSS_KEY_SIZE); - netdev_rss_key_fill(rss_context->rss_key, - MLX4_EN_RSS_KEY_SIZE); } else { en_err(priv, "Unknown RSS hash function requested\n"); err = -EINVAL; From 88c796640eac36209efabe5e42b7b47dee58603e Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Wed, 16 Sep 2015 11:11:22 +0200 Subject: [PATCH 032/116] net: ks8851: Export OF module alias information Drivers needs to export the OF id table and this be built into the module or udev won't have the necessary information to autoload the driver module when the device is registered via OF. Signed-off-by: Javier Martinez Canillas Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index 66d4ab703f45..60f43ec22175 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -1601,6 +1601,7 @@ static const struct of_device_id ks8851_match_table[] = { { .compatible = "micrel,ks8851" }, { } }; +MODULE_DEVICE_TABLE(of, ks8851_match_table); static struct spi_driver ks8851_driver = { .driver = { From ce816eb064c82ab96276969971a561db78e66164 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 16 Sep 2015 12:35:00 +0100 Subject: [PATCH 033/116] solos-pci: Increase headroom on received packets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A comment in include/linux/skbuff.h says that: * Various parts of the networking layer expect at least 32 bytes of * headroom, you should not reduce this. This was demonstrated by a panic when handling fragmented IPv6 packets: http://marc.info/?l=linux-netdev&m=144236093519172&w=2 It's not entirely clear if that comment is still valid — and if it is, perhaps netif_rx() ought to be enforcing it with a warning. But either way, it is rather stupid from a performance point of view for us to be receiving packets into a buffer which doesn't have enough room to prepend an Ethernet header — it means that *every* incoming packet is going to be need to be reallocated. So let's fix that. Signed-off-by: David Woodhouse Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/atm/solos-pci.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c index 74e18b0a6d89..3d7fb6516f74 100644 --- a/drivers/atm/solos-pci.c +++ b/drivers/atm/solos-pci.c @@ -805,7 +805,12 @@ static void solos_bh(unsigned long card_arg) continue; } - skb = alloc_skb(size + 1, GFP_ATOMIC); + /* Use netdev_alloc_skb() because it adds NET_SKB_PAD of + * headroom, and ensures we can route packets back out an + * Ethernet interface (for example) without having to + * reallocate. Adding NET_IP_ALIGN also ensures that both + * PPPoATM and PPPoEoBR2684 packets end up aligned. */ + skb = netdev_alloc_skb_ip_align(NULL, size + 1); if (!skb) { if (net_ratelimit()) dev_warn(&card->dev->dev, "Failed to allocate sk_buff for RX\n"); @@ -869,7 +874,10 @@ static void solos_bh(unsigned long card_arg) /* Allocate RX skbs for any ports which need them */ if (card->using_dma && card->atmdev[port] && !card->rx_skb[port]) { - struct sk_buff *skb = alloc_skb(RX_DMA_SIZE, GFP_ATOMIC); + /* Unlike the MMIO case (qv) we can't add NET_IP_ALIGN + * here; the FPGA can only DMA to addresses which are + * aligned to 4 bytes. */ + struct sk_buff *skb = dev_alloc_skb(RX_DMA_SIZE); if (skb) { SKB_CB(skb)->dma_addr = dma_map_single(&card->dev->dev, skb->data, From 1d325d217c7f190a42fb620ead20bb240fc16af0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 16 Sep 2015 17:26:14 +0200 Subject: [PATCH 034/116] ipv6: ip6_fragment: fix headroom tests and skb leak David Woodhouse reports skb_under_panic when we try to push ethernet header to fragmented ipv6 skbs: skbuff: skb_under_panic: text:c1277f1e len:1294 put:14 head:dec98000 data:dec97ffc tail:0xdec9850a end:0xdec98f40 dev:br-lan [..] ip6_finish_output2+0x196/0x4da David further debugged this: [..] offending fragments were arriving here with skb_headroom(skb)==10. Which is reasonable, being the Solos ADSL card's header of 8 bytes followed by 2 bytes of PPP frame type. The problem is that if netfilter ipv6 defragmentation is used, skb_cow() in ip6_forward will only see reassembled skb. Therefore, headroom is overestimated by 8 bytes (we pulled fragment header) and we don't check the skbs in the frag_list either. We can't do these checks in netfilter defrag since outdev isn't known yet. Furthermore, existing tests in ip6_fragment did not consider the fragment or ipv6 header size when checking headroom of the fraglist skbs. While at it, also fix a skb leak on memory allocation -- ip6_fragment must consume the skb. I tested this e1000 driver hacked to not allocate additional headroom (we end up in slowpath, since LL_RESERVED_SPACE is 16). If 2 bytes of headroom are allocated, fastpath is taken (14 byte ethernet header was pulled, so 16 byte headroom available in all fragments). Reported-by: David Woodhouse Diagnosed-by: David Woodhouse Signed-off-by: Florian Westphal Tested-by: David Woodhouse Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 26ea47930740..92b1aa38f121 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -586,20 +586,22 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, &ipv6_hdr(skb)->saddr); + hroom = LL_RESERVED_SPACE(rt->dst.dev); if (skb_has_frag_list(skb)) { int first_len = skb_pagelen(skb); struct sk_buff *frag2; if (first_len - hlen > mtu || ((first_len - hlen) & 7) || - skb_cloned(skb)) + skb_cloned(skb) || + skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) goto slow_path; skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || - skb_headroom(frag) < hlen) + skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr))) goto slow_path_clean; /* Partially cloned skb? */ @@ -616,8 +618,6 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, err = 0; offset = 0; - frag = skb_shinfo(skb)->frag_list; - skb_frag_list_init(skb); /* BUILD HEADER */ *prevhdr = NEXTHDR_FRAGMENT; @@ -625,8 +625,11 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, if (!tmp_hdr) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); - return -ENOMEM; + err = -ENOMEM; + goto fail; } + frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); __skb_pull(skb, hlen); fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr)); @@ -723,7 +726,6 @@ slow_path: */ *prevhdr = NEXTHDR_FRAGMENT; - hroom = LL_RESERVED_SPACE(rt->dst.dev); troom = rt->dst.dev->needed_tailroom; /* From 34f5b0066435ffb793049b84fafd29fa195bcf90 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 16 Sep 2015 15:30:21 -0400 Subject: [PATCH 035/116] atm: deal with setting entry before mkip was called If we didn't call ATMARP_MKIP before ATMARP_ENCAP the VCC descriptor is non-existant and we'll end up dereferencing a NULL ptr: [1033173.491930] kasan: GPF could be caused by NULL-ptr deref or user memory accessirq event stamp: 123386 [1033173.493678] general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC KASAN [1033173.493689] Modules linked in: [1033173.493697] CPU: 9 PID: 23815 Comm: trinity-c64 Not tainted 4.2.0-next-20150911-sasha-00043-g353d875-dirty #2545 [1033173.493706] task: ffff8800630c4000 ti: ffff880063110000 task.ti: ffff880063110000 [1033173.493823] RIP: clip_ioctl (net/atm/clip.c:320 net/atm/clip.c:689) [1033173.493826] RSP: 0018:ffff880063117a88 EFLAGS: 00010203 [1033173.493828] RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 000000000000000c [1033173.493830] RDX: 0000000000000002 RSI: ffffffffb3f10720 RDI: 0000000000000014 [1033173.493832] RBP: ffff880063117b80 R08: ffff88047574d9a4 R09: 0000000000000000 [1033173.493834] R10: 0000000000000000 R11: 0000000000000000 R12: 1ffff1000c622f53 [1033173.493836] R13: ffff8800cb905500 R14: ffff8808d6da2000 R15: 00000000fffffdfd [1033173.493840] FS: 00007fa56b92d700(0000) GS:ffff880478000000(0000) knlGS:0000000000000000 [1033173.493843] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [1033173.493845] CR2: 0000000000000000 CR3: 00000000630e8000 CR4: 00000000000006a0 [1033173.493855] Stack: [1033173.493862] ffffffffb0b60444 000000000000eaea 0000000041b58ab3 ffffffffb3c3ce32 [1033173.493867] ffffffffb0b6f3e0 ffffffffb0b60444 ffffffffb5ea2e50 1ffff1000c622f5e [1033173.493873] ffff8800630c4cd8 00000000000ee09a ffffffffb3ec4888 ffffffffb5ea2de8 [1033173.493874] Call Trace: [1033173.494108] do_vcc_ioctl (net/atm/ioctl.c:170) [1033173.494113] vcc_ioctl (net/atm/ioctl.c:189) [1033173.494116] svc_ioctl (net/atm/svc.c:605) [1033173.494200] sock_do_ioctl (net/socket.c:874) [1033173.494204] sock_ioctl (net/socket.c:958) [1033173.494244] do_vfs_ioctl (fs/ioctl.c:43 fs/ioctl.c:607) [1033173.494290] SyS_ioctl (fs/ioctl.c:622 fs/ioctl.c:613) [1033173.494295] entry_SYSCALL_64_fastpath (arch/x86/entry/entry_64.S:186) [1033173.494362] Code: fa 48 c1 ea 03 80 3c 02 00 0f 85 50 09 00 00 49 8b 9e 60 06 00 00 48 b8 00 00 00 00 00 fc ff df 48 8d 7b 14 48 89 fa 48 c1 ea 03 <0f> b6 04 02 48 89 fa 83 e2 07 38 d0 7f 08 84 c0 0f 85 14 09 00 All code ======== 0: fa cli 1: 48 c1 ea 03 shr $0x3,%rdx 5: 80 3c 02 00 cmpb $0x0,(%rdx,%rax,1) 9: 0f 85 50 09 00 00 jne 0x95f f: 49 8b 9e 60 06 00 00 mov 0x660(%r14),%rbx 16: 48 b8 00 00 00 00 00 movabs $0xdffffc0000000000,%rax 1d: fc ff df 20: 48 8d 7b 14 lea 0x14(%rbx),%rdi 24: 48 89 fa mov %rdi,%rdx 27: 48 c1 ea 03 shr $0x3,%rdx 2b:* 0f b6 04 02 movzbl (%rdx,%rax,1),%eax <-- trapping instruction 2f: 48 89 fa mov %rdi,%rdx 32: 83 e2 07 and $0x7,%edx 35: 38 d0 cmp %dl,%al 37: 7f 08 jg 0x41 39: 84 c0 test %al,%al 3b: 0f 85 14 09 00 00 jne 0x955 Code starting with the faulting instruction =========================================== 0: 0f b6 04 02 movzbl (%rdx,%rax,1),%eax 4: 48 89 fa mov %rdi,%rdx 7: 83 e2 07 and $0x7,%edx a: 38 d0 cmp %dl,%al c: 7f 08 jg 0x16 e: 84 c0 test %al,%al 10: 0f 85 14 09 00 00 jne 0x92a [1033173.494366] RIP clip_ioctl (net/atm/clip.c:320 net/atm/clip.c:689) [1033173.494368] RSP Signed-off-by: Sasha Levin Signed-off-by: David S. Miller --- net/atm/clip.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/atm/clip.c b/net/atm/clip.c index 17e55dfecbe2..e07f551a863c 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -317,6 +317,9 @@ static int clip_constructor(struct neighbour *neigh) static int clip_encap(struct atm_vcc *vcc, int mode) { + if (!CLIP_VCC(vcc)) + return -EBADFD; + CLIP_VCC(vcc)->encap = mode; return 0; } From 980137a20317055451a73547cf07be4ddea039ee Mon Sep 17 00:00:00 2001 From: Michael Grzeschik Date: Thu, 17 Sep 2015 15:18:34 +0200 Subject: [PATCH 036/116] ARCNET: fix hard_header_len limit For arcnet the bare minimum header only contains the 4 bytes to specify source, dest and offset (1, 1 and 2 bytes respectively). The corresponding struct is struct arc_hardware. The struct archdr contains additionally a union of possible soft headers. When doing $insertusecasehere packets might well include short (or even no?) soft headers. For this reason only use arc_hardware instead of archdr to determine the hard_header_len for an arcnet device. Signed-off-by: Michael Grzeschik Signed-off-by: David S. Miller --- drivers/net/arcnet/arcnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c index 10f71c732b59..816d0e94961c 100644 --- a/drivers/net/arcnet/arcnet.c +++ b/drivers/net/arcnet/arcnet.c @@ -326,7 +326,7 @@ static void arcdev_setup(struct net_device *dev) dev->type = ARPHRD_ARCNET; dev->netdev_ops = &arcnet_netdev_ops; dev->header_ops = &arcnet_header_ops; - dev->hard_header_len = sizeof(struct archdr); + dev->hard_header_len = sizeof(struct arc_hardware); dev->mtu = choose_mtu(); dev->addr_len = ARCNET_ALEN; From c38f6ac74c99801360705d97244ff222ae18dc97 Mon Sep 17 00:00:00 2001 From: Michael Grzeschik Date: Thu, 17 Sep 2015 15:26:16 +0200 Subject: [PATCH 037/116] MAINTAINERS: add arcnet and take maintainership Add entry for arcnet to MAINTAINERS file and add myself as the maintainer of the subsystem. Signed-off-by: Michael Grzeschik Cc: davem@davemloft.net Cc: joe@perches.com Signed-off-by: David S. Miller --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index d4d9d4f6d271..c6aca87cd48a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -808,6 +808,13 @@ S: Maintained F: drivers/video/fbdev/arcfb.c F: drivers/video/fbdev/core/fb_defio.c +ARCNET NETWORK LAYER +M: Michael Grzeschik +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/arcnet/ +F: include/uapi/linux/if_arcnet.h + ARM MFM AND FLOPPY DRIVERS M: Ian Molton S: Maintained From 9dc2ad1008c9f91f55ec6c89ec0f8639dfc91596 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 17 Sep 2015 16:11:10 +0200 Subject: [PATCH 038/116] vxlan: set needed headroom correctly vxlan_setup is called when allocating the net_device, i.e. way before vxlan_newlink (or vxlan_dev_configure) is called. This means vxlan->default_dst is actually unset in vxlan_setup and the condition that sets needed_headroom always takes the else branch. Set the needed_headrom at the point when we have the information about the address family available. Fixes: e4c7ed415387c ("vxlan: add ipv6 support") Fixes: 2853af6a2ea1a ("vxlan: use dev->needed_headroom instead of dev->hard_header_len") CC: Cong Wang Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index cf8b7f0473b3..6ebe562af04e 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2392,10 +2392,6 @@ static void vxlan_setup(struct net_device *dev) eth_hw_addr_random(dev); ether_setup(dev); - if (vxlan->default_dst.remote_ip.sa.sa_family == AF_INET6) - dev->needed_headroom = ETH_HLEN + VXLAN6_HEADROOM; - else - dev->needed_headroom = ETH_HLEN + VXLAN_HEADROOM; dev->netdev_ops = &vxlan_netdev_ops; dev->destructor = free_netdev; @@ -2670,8 +2666,12 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, dev->needed_headroom = lowerdev->hard_header_len + (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); - } else if (use_ipv6) + } else if (use_ipv6) { vxlan->flags |= VXLAN_F_IPV6; + dev->needed_headroom = ETH_HLEN + VXLAN6_HEADROOM; + } else { + dev->needed_headroom = ETH_HLEN + VXLAN_HEADROOM; + } memcpy(&vxlan->cfg, conf, sizeof(*conf)); if (!vxlan->cfg.dst_port) From 057ba29bbe85e9587635e3128b26fa30fe849af9 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 17 Sep 2015 16:11:11 +0200 Subject: [PATCH 039/116] vxlan: reject IPv6 addresses if IPv6 is not configured When IPv6 address is set without IPv6 configured, the vxlan socket is mostly treated as an IPv4 one but various lookus in fdb etc. still take the AF_INET6 into account. This creates incosistencies with weird consequences. Just reject IPv6 addresses in such case. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 6ebe562af04e..bbac1d35ed4e 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2636,8 +2636,11 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, dst->remote_ip.sa.sa_family = AF_INET; if (dst->remote_ip.sa.sa_family == AF_INET6 || - vxlan->cfg.saddr.sa.sa_family == AF_INET6) + vxlan->cfg.saddr.sa.sa_family == AF_INET6) { + if (!IS_ENABLED(CONFIG_IPV6)) + return -EPFNOSUPPORT; use_ipv6 = true; + } if (conf->remote_ifindex) { struct net_device *lowerdev From 378fddc281072a10b621341b9f78c902303a8fe7 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 17 Sep 2015 16:11:12 +0200 Subject: [PATCH 040/116] qlcnic: track vxlan port count The callback for adding vxlan port can be called with the same port for both IPv4 and IPv6. Do not disable the offloading when the same port for both protocols is added and later one of them removed. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qlcnic/qlcnic.h | 1 + .../net/ethernet/qlogic/qlcnic/qlcnic_main.c | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h index 06bcc734fe8d..d6696cfa11d2 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h @@ -536,6 +536,7 @@ struct qlcnic_hardware_context { u8 extend_lb_time; u8 phys_port_id[ETH_ALEN]; u8 lb_mode; + u8 vxlan_port_count; u16 vxlan_port; struct device *hwmon_dev; u32 post_mode; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index 8b08b20e8b30..d4481454b5f8 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -483,11 +483,17 @@ static void qlcnic_add_vxlan_port(struct net_device *netdev, /* Adapter supports only one VXLAN port. Use very first port * for enabling offload */ - if (!qlcnic_encap_rx_offload(adapter) || ahw->vxlan_port) + if (!qlcnic_encap_rx_offload(adapter)) return; + if (!ahw->vxlan_port_count) { + ahw->vxlan_port_count = 1; + ahw->vxlan_port = ntohs(port); + adapter->flags |= QLCNIC_ADD_VXLAN_PORT; + return; + } + if (ahw->vxlan_port == ntohs(port)) + ahw->vxlan_port_count++; - ahw->vxlan_port = ntohs(port); - adapter->flags |= QLCNIC_ADD_VXLAN_PORT; } static void qlcnic_del_vxlan_port(struct net_device *netdev, @@ -496,11 +502,13 @@ static void qlcnic_del_vxlan_port(struct net_device *netdev, struct qlcnic_adapter *adapter = netdev_priv(netdev); struct qlcnic_hardware_context *ahw = adapter->ahw; - if (!qlcnic_encap_rx_offload(adapter) || !ahw->vxlan_port || + if (!qlcnic_encap_rx_offload(adapter) || !ahw->vxlan_port_count || (ahw->vxlan_port != ntohs(port))) return; - adapter->flags |= QLCNIC_DEL_VXLAN_PORT; + ahw->vxlan_port_count--; + if (!ahw->vxlan_port_count) + adapter->flags |= QLCNIC_DEL_VXLAN_PORT; } static netdev_features_t qlcnic_features_check(struct sk_buff *skb, From 1e5b311ab2cc3baf73cdff066ab39c7466bf166b Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 17 Sep 2015 16:11:13 +0200 Subject: [PATCH 041/116] be2net: allow offloading with the same port for IPv4 and IPv6 The callback for adding vxlan port can be called with the same port for both IPv4 and IPv6. Do not disable the offloading if this occurs. Signed-off-by: Jiri Benc Acked-by: Sathya Perla Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be.h | 1 + drivers/net/ethernet/emulex/benet/be_main.c | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h index 0a27805cbbbd..821540913343 100644 --- a/drivers/net/ethernet/emulex/benet/be.h +++ b/drivers/net/ethernet/emulex/benet/be.h @@ -582,6 +582,7 @@ struct be_adapter { u16 pvid; __be16 vxlan_port; int vxlan_port_count; + int vxlan_port_aliases; struct phy_info phy; u8 wol_cap; bool wol_en; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 12687bf52b95..7bf51a1a0a77 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -5176,6 +5176,11 @@ static void be_add_vxlan_port(struct net_device *netdev, sa_family_t sa_family, if (lancer_chip(adapter) || BEx_chip(adapter) || be_is_mc(adapter)) return; + if (adapter->vxlan_port == port && adapter->vxlan_port_count) { + adapter->vxlan_port_aliases++; + return; + } + if (adapter->flags & BE_FLAGS_VXLAN_OFFLOADS) { dev_info(dev, "Only one UDP port supported for VxLAN offloads\n"); @@ -5226,6 +5231,11 @@ static void be_del_vxlan_port(struct net_device *netdev, sa_family_t sa_family, if (adapter->vxlan_port != port) goto done; + if (adapter->vxlan_port_aliases) { + adapter->vxlan_port_aliases--; + return; + } + be_disable_vxlan_offloads(adapter); dev_info(&adapter->pdev->dev, From ac7eccd4d48fcc70d9fd6e4d10657bcde0a73f9f Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 17 Sep 2015 16:11:14 +0200 Subject: [PATCH 042/116] bnx2x: track vxlan port count The callback for adding vxlan port can be called with the same port for both IPv4 and IPv6. Do not disable the offloading when the same port for both protocols is added and later one of them removed. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x.h | 1 + drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h index ba936635322a..b5e64b02200c 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h @@ -1946,6 +1946,7 @@ struct bnx2x { u16 vlan_cnt; u16 vlan_credit; u16 vxlan_dst_port; + u8 vxlan_dst_port_count; bool accept_any_vlan; }; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index 89a174fa1300..f1d62d5dbaff 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -10108,12 +10108,18 @@ static void __bnx2x_add_vxlan_port(struct bnx2x *bp, u16 port) if (!netif_running(bp->dev)) return; - if (bp->vxlan_dst_port || !IS_PF(bp)) { + if (bp->vxlan_dst_port_count && bp->vxlan_dst_port == port) { + bp->vxlan_dst_port_count++; + return; + } + + if (bp->vxlan_dst_port_count || !IS_PF(bp)) { DP(BNX2X_MSG_SP, "Vxlan destination port limit reached\n"); return; } bp->vxlan_dst_port = port; + bp->vxlan_dst_port_count = 1; bnx2x_schedule_sp_rtnl(bp, BNX2X_SP_RTNL_ADD_VXLAN_PORT, 0); } @@ -10128,10 +10134,14 @@ static void bnx2x_add_vxlan_port(struct net_device *netdev, static void __bnx2x_del_vxlan_port(struct bnx2x *bp, u16 port) { - if (!bp->vxlan_dst_port || bp->vxlan_dst_port != port || !IS_PF(bp)) { + if (!bp->vxlan_dst_port_count || bp->vxlan_dst_port != port || + !IS_PF(bp)) { DP(BNX2X_MSG_SP, "Invalid vxlan port\n"); return; } + bp->vxlan_dst_port--; + if (bp->vxlan_dst_port) + return; if (netif_running(bp->dev)) { bnx2x_schedule_sp_rtnl(bp, BNX2X_SP_RTNL_DEL_VXLAN_PORT, 0); From 6cf35642147103195f126d13bdc1d4c32c7c6666 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 17 Sep 2015 16:28:31 +0200 Subject: [PATCH 043/116] MAINTAINERS: remove bouncing email address for qlcnic I got this automated message from when submitting a qlcnic patch: > Shahed Shaikh is no longer with QLogic. If you require assistance please > contact Ariel Elior Ariel.Elior@qlogic.com There's no point in having a bouncing address in MAINTAINERS. CC: Dept-GELinuxNICDev@qlogic.com CC: Ariel Elior Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index c6aca87cd48a..e1c9fbb8bf92 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8497,7 +8497,6 @@ F: Documentation/networking/LICENSE.qla3xxx F: drivers/net/ethernet/qlogic/qla3xxx.* QLOGIC QLCNIC (1/10)Gb ETHERNET DRIVER -M: Shahed Shaikh M: Dept-GELinuxNICDev@qlogic.com L: netdev@vger.kernel.org S: Supported From adb094e5e7285385770eb7a7c122bfc663c5e174 Mon Sep 17 00:00:00 2001 From: Taku Izumi Date: Thu, 17 Sep 2015 23:21:21 +0900 Subject: [PATCH 044/116] fjes: fix off-by-one error at fjes_hw_update_zone_task() Dan Carpenter reported off-by-one error of fjes at http://www.mail-archive.com/netdev@vger.kernel.org/msg77520.html Actually this is a bug. ep_shm_info[epidx].{es_status, zone} should be update inside for loop. This patch fixes this bug. Reported-by: Dan Carpenter Signed-off-by: Taku Izumi Signed-off-by: David S. Miller --- drivers/net/fjes/fjes_hw.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/fjes/fjes_hw.c b/drivers/net/fjes/fjes_hw.c index b5f4a78da828..2d3848c9dc35 100644 --- a/drivers/net/fjes/fjes_hw.c +++ b/drivers/net/fjes/fjes_hw.c @@ -1011,11 +1011,11 @@ static void fjes_hw_update_zone_task(struct work_struct *work) set_bit(epidx, &irq_bit); break; } + + hw->ep_shm_info[epidx].es_status = + info[epidx].es_status; + hw->ep_shm_info[epidx].zone = info[epidx].zone; } - - hw->ep_shm_info[epidx].es_status = info[epidx].es_status; - hw->ep_shm_info[epidx].zone = info[epidx].zone; - break; } From c2e7204d180f8efc80f27959ca9cf16fa17f67db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Sep 2015 08:38:00 -0700 Subject: [PATCH 045/116] tcp_cubic: do not set epoch_start in the future Tracking idle time in bictcp_cwnd_event() is imprecise, as epoch_start is normally set at ACK processing time, not at send time. Doing a proper fix would need to add an additional state variable, and does not seem worth the trouble, given CUBIC bug has been there forever before Jana noticed it. Let's simply not set epoch_start in the future, otherwise bictcp_update() could overflow and CUBIC would again grow cwnd too fast. This was detected thanks to a packetdrill test Neal wrote that was flaky before applying this fix. Fixes: 30927520dbae ("tcp_cubic: better follow cubic curve after idle period") Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Cc: Jana Iyengar Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index c6ded6b2a79f..448c2615fece 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -154,14 +154,20 @@ static void bictcp_init(struct sock *sk) static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { - s32 delta = tcp_time_stamp - tcp_sk(sk)->lsndtime; struct bictcp *ca = inet_csk_ca(sk); + u32 now = tcp_time_stamp; + s32 delta; + + delta = now - tcp_sk(sk)->lsndtime; /* We were application limited (idle) for a while. * Shift epoch_start to keep cwnd growth to cubic curve. */ - if (ca->epoch_start && delta > 0) + if (ca->epoch_start && delta > 0) { ca->epoch_start += delta; + if (after(ca->epoch_start, now)) + ca->epoch_start = now; + } return; } } From ba5ca7848be05db6235aeb703586b821aa00e381 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Wed, 16 Sep 2015 15:27:43 +0200 Subject: [PATCH 046/116] bna: check for dma mapping errors Check for DMA mapping errors, recover from them and register them in ethtool stats like other errors. Cc: Rasesh Mody Signed-off-by: Ivan Vecera Acked-by: Rasesh Mody Signed-off-by: David S. Miller --- drivers/net/ethernet/brocade/bna/bna_tx_rx.c | 2 ++ drivers/net/ethernet/brocade/bna/bna_types.h | 1 + drivers/net/ethernet/brocade/bna/bnad.c | 29 ++++++++++++++++++- drivers/net/ethernet/brocade/bna/bnad.h | 2 ++ .../net/ethernet/brocade/bna/bnad_ethtool.c | 4 +++ 5 files changed, 37 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/brocade/bna/bna_tx_rx.c b/drivers/net/ethernet/brocade/bna/bna_tx_rx.c index 5d0753cc7e73..04b0d16b210e 100644 --- a/drivers/net/ethernet/brocade/bna/bna_tx_rx.c +++ b/drivers/net/ethernet/brocade/bna/bna_tx_rx.c @@ -2400,6 +2400,7 @@ bna_rx_create(struct bna *bna, struct bnad *bnad, q0->rcb->id = 0; q0->rx_packets = q0->rx_bytes = 0; q0->rx_packets_with_error = q0->rxbuf_alloc_failed = 0; + q0->rxbuf_map_failed = 0; bna_rxq_qpt_setup(q0, rxp, dpage_count, PAGE_SIZE, &dqpt_mem[i], &dsqpt_mem[i], &dpage_mem[i]); @@ -2428,6 +2429,7 @@ bna_rx_create(struct bna *bna, struct bnad *bnad, : rx_cfg->q1_buf_size; q1->rx_packets = q1->rx_bytes = 0; q1->rx_packets_with_error = q1->rxbuf_alloc_failed = 0; + q1->rxbuf_map_failed = 0; bna_rxq_qpt_setup(q1, rxp, hpage_count, PAGE_SIZE, &hqpt_mem[i], &hsqpt_mem[i], diff --git a/drivers/net/ethernet/brocade/bna/bna_types.h b/drivers/net/ethernet/brocade/bna/bna_types.h index e0e797f2ea14..c438d032e8bf 100644 --- a/drivers/net/ethernet/brocade/bna/bna_types.h +++ b/drivers/net/ethernet/brocade/bna/bna_types.h @@ -587,6 +587,7 @@ struct bna_rxq { u64 rx_bytes; u64 rx_packets_with_error; u64 rxbuf_alloc_failed; + u64 rxbuf_map_failed; }; /* RxQ pair */ diff --git a/drivers/net/ethernet/brocade/bna/bnad.c b/drivers/net/ethernet/brocade/bna/bnad.c index 506047c38607..21a0cfc3e7ec 100644 --- a/drivers/net/ethernet/brocade/bna/bnad.c +++ b/drivers/net/ethernet/brocade/bna/bnad.c @@ -399,7 +399,13 @@ bnad_rxq_refill_page(struct bnad *bnad, struct bna_rcb *rcb, u32 nalloc) } dma_addr = dma_map_page(&bnad->pcidev->dev, page, page_offset, - unmap_q->map_size, DMA_FROM_DEVICE); + unmap_q->map_size, DMA_FROM_DEVICE); + if (dma_mapping_error(&bnad->pcidev->dev, dma_addr)) { + put_page(page); + BNAD_UPDATE_CTR(bnad, rxbuf_map_failed); + rcb->rxq->rxbuf_map_failed++; + goto finishing; + } unmap->page = page; unmap->page_offset = page_offset; @@ -454,8 +460,15 @@ bnad_rxq_refill_skb(struct bnad *bnad, struct bna_rcb *rcb, u32 nalloc) rcb->rxq->rxbuf_alloc_failed++; goto finishing; } + dma_addr = dma_map_single(&bnad->pcidev->dev, skb->data, buff_sz, DMA_FROM_DEVICE); + if (dma_mapping_error(&bnad->pcidev->dev, dma_addr)) { + dev_kfree_skb_any(skb); + BNAD_UPDATE_CTR(bnad, rxbuf_map_failed); + rcb->rxq->rxbuf_map_failed++; + goto finishing; + } unmap->skb = skb; dma_unmap_addr_set(&unmap->vector, dma_addr, dma_addr); @@ -3025,6 +3038,11 @@ bnad_start_xmit(struct sk_buff *skb, struct net_device *netdev) unmap = head_unmap; dma_addr = dma_map_single(&bnad->pcidev->dev, skb->data, len, DMA_TO_DEVICE); + if (dma_mapping_error(&bnad->pcidev->dev, dma_addr)) { + dev_kfree_skb_any(skb); + BNAD_UPDATE_CTR(bnad, tx_skb_map_failed); + return NETDEV_TX_OK; + } BNA_SET_DMA_ADDR(dma_addr, &txqent->vector[0].host_addr); txqent->vector[0].length = htons(len); dma_unmap_addr_set(&unmap->vectors[0], dma_addr, dma_addr); @@ -3056,6 +3074,15 @@ bnad_start_xmit(struct sk_buff *skb, struct net_device *netdev) dma_addr = skb_frag_dma_map(&bnad->pcidev->dev, frag, 0, size, DMA_TO_DEVICE); + if (dma_mapping_error(&bnad->pcidev->dev, dma_addr)) { + /* Undo the changes starting at tcb->producer_index */ + bnad_tx_buff_unmap(bnad, unmap_q, q_depth, + tcb->producer_index); + dev_kfree_skb_any(skb); + BNAD_UPDATE_CTR(bnad, tx_skb_map_failed); + return NETDEV_TX_OK; + } + dma_unmap_len_set(&unmap->vectors[vect_id], dma_len, size); BNA_SET_DMA_ADDR(dma_addr, &txqent->vector[vect_id].host_addr); txqent->vector[vect_id].length = htons(size); diff --git a/drivers/net/ethernet/brocade/bna/bnad.h b/drivers/net/ethernet/brocade/bna/bnad.h index faedbf24777e..f4ed816b93ee 100644 --- a/drivers/net/ethernet/brocade/bna/bnad.h +++ b/drivers/net/ethernet/brocade/bna/bnad.h @@ -175,6 +175,7 @@ struct bnad_drv_stats { u64 tx_skb_headlen_zero; u64 tx_skb_frag_zero; u64 tx_skb_len_mismatch; + u64 tx_skb_map_failed; u64 hw_stats_updates; u64 netif_rx_dropped; @@ -189,6 +190,7 @@ struct bnad_drv_stats { u64 rx_unmap_q_alloc_failed; u64 rxbuf_alloc_failed; + u64 rxbuf_map_failed; }; /* Complete driver stats */ diff --git a/drivers/net/ethernet/brocade/bna/bnad_ethtool.c b/drivers/net/ethernet/brocade/bna/bnad_ethtool.c index 2bdfc5dff4b1..0e4fdc3dd729 100644 --- a/drivers/net/ethernet/brocade/bna/bnad_ethtool.c +++ b/drivers/net/ethernet/brocade/bna/bnad_ethtool.c @@ -90,6 +90,7 @@ static const char *bnad_net_stats_strings[BNAD_ETHTOOL_STATS_NUM] = { "tx_skb_headlen_zero", "tx_skb_frag_zero", "tx_skb_len_mismatch", + "tx_skb_map_failed", "hw_stats_updates", "netif_rx_dropped", @@ -102,6 +103,7 @@ static const char *bnad_net_stats_strings[BNAD_ETHTOOL_STATS_NUM] = { "tx_unmap_q_alloc_failed", "rx_unmap_q_alloc_failed", "rxbuf_alloc_failed", + "rxbuf_map_failed", "mac_stats_clr_cnt", "mac_frame_64", @@ -807,6 +809,7 @@ bnad_per_q_stats_fill(struct bnad *bnad, u64 *buf, int bi) rx_packets_with_error; buf[bi++] = rcb->rxq-> rxbuf_alloc_failed; + buf[bi++] = rcb->rxq->rxbuf_map_failed; buf[bi++] = rcb->producer_index; buf[bi++] = rcb->consumer_index; } @@ -821,6 +824,7 @@ bnad_per_q_stats_fill(struct bnad *bnad, u64 *buf, int bi) rx_packets_with_error; buf[bi++] = rcb->rxq-> rxbuf_alloc_failed; + buf[bi++] = rcb->rxq->rxbuf_map_failed; buf[bi++] = rcb->producer_index; buf[bi++] = rcb->consumer_index; } From 0315e382704817b279e5693dca8ab9d89aa20b3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nikola=20Forr=C3=B3?= Date: Thu, 17 Sep 2015 16:01:32 +0200 Subject: [PATCH 047/116] net: Fix behaviour of unreachable, blackhole and prohibit routes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Man page of ip-route(8) says following about route types: unreachable - these destinations are unreachable. Packets are dis‐ carded and the ICMP message host unreachable is generated. The local senders get an EHOSTUNREACH error. blackhole - these destinations are unreachable. Packets are dis‐ carded silently. The local senders get an EINVAL error. prohibit - these destinations are unreachable. Packets are discarded and the ICMP message communication administratively prohibited is generated. The local senders get an EACCES error. In the inet6 address family, this was correct, except the local senders got ENETUNREACH error instead of EHOSTUNREACH in case of unreachable route. In the inet address family, all three route types generated ICMP message net unreachable, and the local senders got ENETUNREACH error. In both address families all three route types now behave consistently with documentation. Signed-off-by: Nikola Forró Signed-off-by: David S. Miller --- include/net/ip_fib.h | 30 +++++++++++++++++++----------- net/ipv4/route.c | 6 ++++-- net/ipv6/route.c | 4 +++- 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index a37d0432bebd..727d6e9a9685 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -236,8 +236,11 @@ static inline int fib_lookup(struct net *net, const struct flowi4 *flp, rcu_read_lock(); tb = fib_get_table(net, RT_TABLE_MAIN); - if (tb && !fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF)) - err = 0; + if (tb) + err = fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF); + + if (err == -EAGAIN) + err = -ENETUNREACH; rcu_read_unlock(); @@ -258,7 +261,7 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_table *tb; - int err; + int err = -ENETUNREACH; flags |= FIB_LOOKUP_NOREF; if (net->ipv4.fib_has_custom_rules) @@ -268,15 +271,20 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp, res->tclassid = 0; - for (err = 0; !err; err = -ENETUNREACH) { - tb = rcu_dereference_rtnl(net->ipv4.fib_main); - if (tb && !fib_table_lookup(tb, flp, res, flags)) - break; + tb = rcu_dereference_rtnl(net->ipv4.fib_main); + if (tb) + err = fib_table_lookup(tb, flp, res, flags); - tb = rcu_dereference_rtnl(net->ipv4.fib_default); - if (tb && !fib_table_lookup(tb, flp, res, flags)) - break; - } + if (!err) + goto out; + + tb = rcu_dereference_rtnl(net->ipv4.fib_default); + if (tb) + err = fib_table_lookup(tb, flp, res, flags); + +out: + if (err == -EAGAIN) + err = -ENETUNREACH; rcu_read_unlock(); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5f4a5565ad8b..c6ad99ad0ffb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2045,6 +2045,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) struct fib_result res; struct rtable *rth; int orig_oif; + int err = -ENETUNREACH; res.tclassid = 0; res.fi = NULL; @@ -2153,7 +2154,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto make_route; } - if (fib_lookup(net, fl4, &res, 0)) { + err = fib_lookup(net, fl4, &res, 0); + if (err) { res.fi = NULL; res.table = NULL; if (fl4->flowi4_oif) { @@ -2181,7 +2183,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) res.type = RTN_UNICAST; goto make_route; } - rth = ERR_PTR(-ENETUNREACH); + rth = ERR_PTR(err); goto out; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d5fa50297f80..f204089e854c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1885,9 +1885,11 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret) rt->dst.input = ip6_pkt_prohibit; break; case RTN_THROW: + case RTN_UNREACHABLE: default: rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN - : -ENETUNREACH; + : (cfg->fc_type == RTN_UNREACHABLE) + ? -EHOSTUNREACH : -ENETUNREACH; rt->dst.output = ip6_pkt_discard_out; rt->dst.input = ip6_pkt_discard; break; From fc27bd115b334e3ebdc682a42a47c3aea2566dcc Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 18 Sep 2015 00:19:08 +0100 Subject: [PATCH 048/116] 8139cp: Use dev_kfree_skb_any() instead of dev_kfree_skb() in cp_clean_rings() This can be called from cp_tx_timeout() with interrupts disabled. Spotted by Francois Romieu Signed-off-by: David Woodhouse Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/8139cp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index d79e33b3c191..52a533445f30 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -1151,7 +1151,7 @@ static void cp_clean_rings (struct cp_private *cp) desc = cp->rx_ring + i; dma_unmap_single(&cp->pdev->dev,le64_to_cpu(desc->addr), cp->rx_buf_sz, PCI_DMA_FROMDEVICE); - dev_kfree_skb(cp->rx_skb[i]); + dev_kfree_skb_any(cp->rx_skb[i]); } } @@ -1164,7 +1164,7 @@ static void cp_clean_rings (struct cp_private *cp) le32_to_cpu(desc->opts1) & 0xffff, PCI_DMA_TODEVICE); if (le32_to_cpu(desc->opts1) & LastFrag) - dev_kfree_skb(skb); + dev_kfree_skb_any(skb); cp->dev->stats.tx_dropped++; } } From 7a8a8e75d505147358b225173e890ada43a267e2 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 18 Sep 2015 00:21:54 +0100 Subject: [PATCH 049/116] 8139cp: Call __cp_set_rx_mode() from cp_tx_timeout() Unless we reset the RX config, on real hardware I don't seem to receive any packets after a TX timeout. Signed-off-by: David Woodhouse Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/8139cp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index 52a533445f30..ba3dab721806 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -1261,6 +1261,7 @@ static void cp_tx_timeout(struct net_device *dev) cp_clean_rings(cp); rc = cp_init_rings(cp); cp_start_hw(cp); + __cp_set_rx_mode(dev); cp_enable_irq(cp); netif_wake_queue(dev); From aab0c0e62ec4af224d1b6b40fca65055d403400b Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Fri, 18 Sep 2015 15:42:30 +0800 Subject: [PATCH 050/116] Revert "net/phy: Add Vitesse 8641 phy ID" This reverts commit 1298267b548a78840bd4b3e030993ff8747ca5e6. That commit claim that the Vitesse VSC8641 is compatible with Vitesse 82xx. But this is not true. It seems that all the registers used in Vitesse phy driver are not compatible between 8641 and 82xx. It does cause malfunction of the Ethernet on p1010rdb-pa board. So we definitely need a rework in order to support the 8641 phy in this driver. Signed-off-by: Kevin Hao Signed-off-by: David S. Miller --- drivers/net/phy/vitesse.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/net/phy/vitesse.c b/drivers/net/phy/vitesse.c index 17cad185169d..76cad712ddb2 100644 --- a/drivers/net/phy/vitesse.c +++ b/drivers/net/phy/vitesse.c @@ -66,7 +66,6 @@ #define PHY_ID_VSC8244 0x000fc6c0 #define PHY_ID_VSC8514 0x00070670 #define PHY_ID_VSC8574 0x000704a0 -#define PHY_ID_VSC8641 0x00070431 #define PHY_ID_VSC8662 0x00070660 #define PHY_ID_VSC8221 0x000fc550 #define PHY_ID_VSC8211 0x000fc4b0 @@ -272,18 +271,6 @@ static struct phy_driver vsc82xx_driver[] = { .ack_interrupt = &vsc824x_ack_interrupt, .config_intr = &vsc82xx_config_intr, .driver = { .owner = THIS_MODULE,}, -}, { - .phy_id = PHY_ID_VSC8641, - .name = "Vitesse VSC8641", - .phy_id_mask = 0x000ffff0, - .features = PHY_GBIT_FEATURES, - .flags = PHY_HAS_INTERRUPT, - .config_init = &vsc824x_config_init, - .config_aneg = &vsc82x4_config_aneg, - .read_status = &genphy_read_status, - .ack_interrupt = &vsc824x_ack_interrupt, - .config_intr = &vsc82xx_config_intr, - .driver = { .owner = THIS_MODULE,}, }, { .phy_id = PHY_ID_VSC8662, .name = "Vitesse VSC8662", @@ -331,7 +318,6 @@ static struct mdio_device_id __maybe_unused vitesse_tbl[] = { { PHY_ID_VSC8244, 0x000fffc0 }, { PHY_ID_VSC8514, 0x000ffff0 }, { PHY_ID_VSC8574, 0x000ffff0 }, - { PHY_ID_VSC8641, 0x000ffff0 }, { PHY_ID_VSC8662, 0x000ffff0 }, { PHY_ID_VSC8221, 0x000ffff0 }, { PHY_ID_VSC8211, 0x000ffff0 }, From 4e3ae00100945d39e1f83b7c0179a114ccf55759 Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Fri, 18 Sep 2015 10:46:31 +0200 Subject: [PATCH 051/116] tipc: reinitialize pointer after skb linearize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The msg pointer into header may change after skb linearization. We must reinitialize it after calling skb_linearize to prevent operating on a freed or invalid pointer. Signed-off-by: Erik Hugne Reported-by: Tamás Végh Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/msg.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 562c926a51cc..c5ac436235e0 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -539,6 +539,7 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) *err = -TIPC_ERR_NO_NAME; if (skb_linearize(skb)) return false; + msg = buf_msg(skb); if (msg_reroute_cnt(msg)) return false; dnode = addr_domain(net, msg_lookup_scope(msg)); From bc22a0e2ea03b75b51a1f722f93821744b5b5ff1 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 18 Sep 2015 11:47:40 +0200 Subject: [PATCH 052/116] iptunnel: make rx/tx bytes counters consistent This was already done a long time ago in commit 64194c31a0b6 ("inet: Make tunnel RX/TX byte counters more consistent") but tx path was broken (at least since 3.10). Before the patch the gre header was included on tx. After the patch: $ ping -c1 192.168.0.121 ; ip -s l ls dev gre1 PING 192.168.0.121 (192.168.0.121) 56(84) bytes of data. 64 bytes from 192.168.0.121: icmp_req=1 ttl=64 time=2.95 ms --- 192.168.0.121 ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 2.955/2.955/2.955/0.000 ms 7: gre1@NONE: mtu 1468 qdisc noqueue state UNKNOWN mode DEFAULT group default link/gre 10.16.0.249 peer 10.16.0.121 RX: bytes packets errors dropped overrun mcast 84 1 0 0 0 0 TX: bytes packets errors dropped carrier collsns 84 1 0 0 0 0 Reported-by: Julien Meunier Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 29ed6c5a5185..9b97204b8c81 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -51,7 +51,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet) { - int pkt_len = skb->len; + int pkt_len = skb->len - skb_inner_network_offset(skb); struct iphdr *iph; int err; From 83cf9a2521b0934a5f9d04082c9bb4f554fddcd4 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 18 Sep 2015 11:47:41 +0200 Subject: [PATCH 053/116] ip6tunnel: make rx/tx bytes counters consistent Like the previous patch, which fixes ipv4 tunnels, here is the ipv6 part. Before the patch, the external ipv6 header + gre header were included on tx. After the patch: $ ping -c1 192.168.6.121 ; ip -s l ls dev ip6gre1 PING 192.168.6.121 (192.168.6.121) 56(84) bytes of data. 64 bytes from 192.168.6.121: icmp_req=1 ttl=64 time=1.92 ms --- 192.168.6.121 ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 1.923/1.923/1.923/0.000 ms 7: ip6gre1@NONE: mtu 1440 qdisc noqueue state UNKNOWN mode DEFAULT group default link/gre6 20:01:06:60:30:08:c1:c3:00:00:00:00:00:00:01:23 peer 20:01:06:60:30:08:c1:c3:00:00:00:00:00:00:01:21 RX: bytes packets errors dropped overrun mcast 84 1 0 0 0 0 TX: bytes packets errors dropped carrier collsns 84 1 0 0 0 0 $ ping -c1 192.168.1.121 ; ip -s l ls dev ip6tnl1 PING 192.168.1.121 (192.168.1.121) 56(84) bytes of data. 64 bytes from 192.168.1.121: icmp_req=1 ttl=64 time=2.28 ms --- 192.168.1.121 ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 2.288/2.288/2.288/0.000 ms 8: ip6tnl1@NONE: mtu 1452 qdisc noqueue state UNKNOWN mode DEFAULT group default link/tunnel6 2001:660:3008:c1c3::123 peer 2001:660:3008:c1c3::121 RX: bytes packets errors dropped overrun mcast 84 1 0 0 0 0 TX: bytes packets errors dropped carrier collsns 84 1 0 0 0 0 Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 65c2a9397b3c..fa915fa0f703 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -86,7 +86,7 @@ static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, struct net_device_stats *stats = &dev->stats; int pkt_len, err; - pkt_len = skb->len; + pkt_len = skb->len - skb_inner_network_offset(skb); err = ip6_local_out_sk(sk, skb); if (net_xmit_eval(err) == 0) { From 3ea79249e81e5ed051f2e6480cbde896d99046e8 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 18 Sep 2015 13:41:09 +0300 Subject: [PATCH 054/116] macvtap: fix TUNSETSNDBUF values > 64k Upon TUNSETSNDBUF, macvtap reads the requested sndbuf size into a local variable u. commit 39ec7de7092b ("macvtap: fix uninitialized access on TUNSETIFF") changed its type to u16 (which is the right thing to do for all other macvtap ioctls), breaking all values > 64k. The value of TUNSETSNDBUF is actually a signed 32 bit integer, so the right thing to do is to read it into an int. Cc: David S. Miller Fixes: 39ec7de7092b ("macvtap: fix uninitialized access on TUNSETIFF") Reported-by: Mark A. Peloquin Bisected-by: Matthew Rosato Reported-by: Christian Borntraeger Signed-off-by: Michael S. Tsirkin Tested-by: Matthew Rosato Acked-by: Christian Borntraeger Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index edd77342773a..248478c6f6e4 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -1111,10 +1111,10 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd, return 0; case TUNSETSNDBUF: - if (get_user(u, up)) + if (get_user(s, sp)) return -EFAULT; - q->sk.sk_sndbuf = u; + q->sk.sk_sndbuf = s; return 0; case TUNGETVNETHDRSZ: From 1f770c0a09da855a2b51af6d19de97fb955eca85 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 18 Sep 2015 19:16:50 +0800 Subject: [PATCH 055/116] netlink: Fix autobind race condition that leads to zero port ID The commit c0bb07df7d981e4091432754e30c9c720e2c0c78 ("netlink: Reset portid after netlink_insert failure") introduced a race condition where if two threads try to autobind the same socket one of them may end up with a zero port ID. This led to kernel deadlocks that were observed by multiple people. This patch reverts that commit and instead fixes it by introducing a separte rhash_portid variable so that the real portid is only set after the socket has been successfully hashed. Fixes: c0bb07df7d98 ("netlink: Reset portid after netlink_insert failure") Reported-by: Tejun Heo Reported-by: Linus Torvalds Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 12 +++++++----- net/netlink/af_netlink.h | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4cad99d6c68b..9f51608b968a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1031,7 +1031,7 @@ static inline int netlink_compare(struct rhashtable_compare_arg *arg, const struct netlink_compare_arg *x = arg->key; const struct netlink_sock *nlk = ptr; - return nlk->portid != x->portid || + return nlk->rhash_portid != x->portid || !net_eq(sock_net(&nlk->sk), read_pnet(&x->pnet)); } @@ -1057,7 +1057,7 @@ static int __netlink_insert(struct netlink_table *table, struct sock *sk) { struct netlink_compare_arg arg; - netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid); + netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->rhash_portid); return rhashtable_lookup_insert_key(&table->hash, &arg, &nlk_sk(sk)->node, netlink_rhashtable_params); @@ -1119,7 +1119,7 @@ static int netlink_insert(struct sock *sk, u32 portid) unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX)) goto err; - nlk_sk(sk)->portid = portid; + nlk_sk(sk)->rhash_portid = portid; sock_hold(sk); err = __netlink_insert(table, sk); @@ -1131,10 +1131,12 @@ static int netlink_insert(struct sock *sk, u32 portid) err = -EOVERFLOW; if (err == -EEXIST) err = -EADDRINUSE; - nlk_sk(sk)->portid = 0; sock_put(sk); + goto err; } + nlk_sk(sk)->portid = portid; + err: release_sock(sk); return err; @@ -3271,7 +3273,7 @@ static inline u32 netlink_hash(const void *data, u32 len, u32 seed) const struct netlink_sock *nlk = data; struct netlink_compare_arg arg; - netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid); + netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->rhash_portid); return jhash2((u32 *)&arg, netlink_compare_arg_len / sizeof(u32), seed); } diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index df9a06090db6..80b2b7526dfd 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -25,6 +25,7 @@ struct netlink_ring { struct netlink_sock { /* struct sock has to be the first member of netlink_sock */ struct sock sk; + u32 rhash_portid; u32 portid; u32 dst_portid; u32 dst_group; From c7d778fa74a88e7b5a09ba498d66bd67a7dc7df0 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Fri, 18 Sep 2015 17:54:00 +0200 Subject: [PATCH 056/116] net: arc: Fix module autoload for OF platform driver This platform driver has a OF device ID table but the OF module alias information is not created so module autoloading won't work. Signed-off-by: Luis de Bethencourt Signed-off-by: David S. Miller --- drivers/net/ethernet/arc/emac_arc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/arc/emac_arc.c b/drivers/net/ethernet/arc/emac_arc.c index f9cb99bfb511..ffd180570920 100644 --- a/drivers/net/ethernet/arc/emac_arc.c +++ b/drivers/net/ethernet/arc/emac_arc.c @@ -78,6 +78,7 @@ static const struct of_device_id emac_arc_dt_ids[] = { { .compatible = "snps,arc-emac" }, { /* Sentinel */ } }; +MODULE_DEVICE_TABLE(of, emac_arc_dt_ids); static struct platform_driver emac_arc_driver = { .probe = emac_arc_probe, From 46d5a3431f2dbb2392b501be9b9f9c2ca17737d8 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Fri, 18 Sep 2015 17:54:30 +0200 Subject: [PATCH 057/116] net: systemport: Fix module autoload for OF platform driver This platform driver has a OF device ID table but the OF module alias information is not created so module autoloading won't work. Signed-off-by: Luis de Bethencourt Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bcmsysport.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index b9a5a97ed4dd..f1b5364f3521 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -2079,6 +2079,7 @@ static const struct of_device_id bcm_sysport_of_match[] = { { .compatible = "brcm,systemport" }, { /* sentinel */ } }; +MODULE_DEVICE_TABLE(of, bcm_sysport_of_match); static struct platform_driver bcm_sysport_driver = { .probe = bcm_sysport_probe, From e8048e5595bedb1f4b20597d059d6877a710f12d Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Fri, 18 Sep 2015 17:55:02 +0200 Subject: [PATCH 058/116] net: bcmgenet: Fix module autoload for OF platform driver This platform driver has a OF device ID table but the OF module alias information is not created so module autoloading won't work. Signed-off-by: Luis de Bethencourt Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index fadbd0088d3e..3bc701e4c59e 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -3155,6 +3155,7 @@ static const struct of_device_id bcmgenet_match[] = { { .compatible = "brcm,genet-v4", .data = (void *)GENET_V4 }, { }, }; +MODULE_DEVICE_TABLE(of, bcmgenet_match); static int bcmgenet_probe(struct platform_device *pdev) { From 23860063706f2037008fb423820cdbd0dcd558bb Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Fri, 18 Sep 2015 17:55:27 +0200 Subject: [PATCH 059/116] net: gianfar_ptp: Fix module autoload for OF platform driver This platform driver has a OF device ID table but the OF module alias information is not created so module autoloading won't work. Signed-off-by: Luis de Bethencourt Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/gianfar_ptp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/freescale/gianfar_ptp.c b/drivers/net/ethernet/freescale/gianfar_ptp.c index 8e3cd77aa347..664d0c261269 100644 --- a/drivers/net/ethernet/freescale/gianfar_ptp.c +++ b/drivers/net/ethernet/freescale/gianfar_ptp.c @@ -557,6 +557,7 @@ static const struct of_device_id match_table[] = { { .compatible = "fsl,etsec-ptp" }, {}, }; +MODULE_DEVICE_TABLE(of, match_table); static struct platform_driver gianfar_ptp_driver = { .driver = { From ebd8ebf078879973d0e8d2642253c091c23bd7d2 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Fri, 18 Sep 2015 17:56:21 +0200 Subject: [PATCH 060/116] net: moxa: Fix module autoload for OF platform driver This platform driver has a OF device ID table but the OF module alias information is not created so module autoloading won't work. Signed-off-by: Luis de Bethencourt Signed-off-by: David S. Miller --- drivers/net/ethernet/moxa/moxart_ether.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/moxa/moxart_ether.c b/drivers/net/ethernet/moxa/moxart_ether.c index becbb5f1f5a7..a10c928bbd6b 100644 --- a/drivers/net/ethernet/moxa/moxart_ether.c +++ b/drivers/net/ethernet/moxa/moxart_ether.c @@ -552,6 +552,7 @@ static const struct of_device_id moxart_mac_match[] = { { .compatible = "moxa,moxart-mac" }, { } }; +MODULE_DEVICE_TABLE(of, moxart_mac_match); static struct platform_driver moxart_mac_driver = { .probe = moxart_mac_probe, From 2f90a3070690ad80c38d650e91b96b5dcbdfe2fd Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Fri, 18 Sep 2015 18:16:12 +0200 Subject: [PATCH 061/116] net: phy: mdio-bcm-unimac: Fix module autoload for OF platform driver This platform driver has a OF device ID table but the OF module alias information is not created so module autoloading won't work. Signed-off-by: Luis de Bethencourt Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/mdio-bcm-unimac.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/mdio-bcm-unimac.c b/drivers/net/phy/mdio-bcm-unimac.c index 6a52a7f0fa0d..4bde5e728fe0 100644 --- a/drivers/net/phy/mdio-bcm-unimac.c +++ b/drivers/net/phy/mdio-bcm-unimac.c @@ -244,6 +244,7 @@ static const struct of_device_id unimac_mdio_ids[] = { { .compatible = "brcm,unimac-mdio", }, { /* sentinel */ }, }; +MODULE_DEVICE_TABLE(of, unimac_mdio_ids); static struct platform_driver unimac_mdio_driver = { .driver = { From 1ccb141e31d0dc4c1234b0886cdc8b4399112c59 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Fri, 18 Sep 2015 18:16:29 +0200 Subject: [PATCH 062/116] net: phy: mdio-gpio: Fix module autoload for OF platform driver This platform driver has a OF device ID table but the OF module alias information is not created so module autoloading won't work. Signed-off-by: Luis de Bethencourt Signed-off-by: David S. Miller --- drivers/net/phy/mdio-gpio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 7dc21e56a7aa..3bc9f03349f3 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -261,6 +261,7 @@ static const struct of_device_id mdio_gpio_of_match[] = { { .compatible = "virtual,mdio-gpio", }, { /* sentinel */ } }; +MODULE_DEVICE_TABLE(of, mdio_gpio_of_match); static struct platform_driver mdio_gpio_driver = { .probe = mdio_gpio_probe, From 4a476bd6d1d923922ec950ddc4c27b279f6901eb Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Sun, 20 Sep 2015 02:25:38 -0700 Subject: [PATCH 063/116] usbnet: New driver for QinHeng CH9200 devices There's a bunch of cheap USB 10/100 devices based on QinHeng chipsets. The vendor driver supports the CH9100 and CH9200 devices, but the majority of the code is of the if (ch9100) {} else {} form, with the most significant difference being that CH9200 provides a real MII interface but CH9100 fakes one with a bunch of global variables and magic commands. I don't have a CH9100, so it's probably better if someone who does provides an independent driver for it. In any case, this is a lightly cleaned up version of the vendor driver with all the CH9100 code dropped. Signed-off-by: Matthew Garrett Signed-off-by: David S. Miller --- drivers/net/usb/Kconfig | 11 + drivers/net/usb/Makefile | 2 +- drivers/net/usb/ch9200.c | 443 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 455 insertions(+), 1 deletion(-) create mode 100644 drivers/net/usb/ch9200.c diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig index 1610b79ae386..fbb9325d1f6e 100644 --- a/drivers/net/usb/Kconfig +++ b/drivers/net/usb/Kconfig @@ -583,4 +583,15 @@ config USB_VL600 http://ubuntuforums.org/showpost.php?p=10589647&postcount=17 +config USB_NET_CH9200 + tristate "QingHeng CH9200 USB ethernet support" + depends on USB_USBNET + select MII + help + Choose this option if you have a USB ethernet adapter with a QinHeng + CH9200 chipset. + + To compile this driver as a module, choose M here: the + module will be called ch9200. + endif # USB_NET_DRIVERS diff --git a/drivers/net/usb/Makefile b/drivers/net/usb/Makefile index cf6a0e610a7f..b5f04068dbe4 100644 --- a/drivers/net/usb/Makefile +++ b/drivers/net/usb/Makefile @@ -38,4 +38,4 @@ obj-$(CONFIG_USB_NET_HUAWEI_CDC_NCM) += huawei_cdc_ncm.o obj-$(CONFIG_USB_VL600) += lg-vl600.o obj-$(CONFIG_USB_NET_QMI_WWAN) += qmi_wwan.o obj-$(CONFIG_USB_NET_CDC_MBIM) += cdc_mbim.o - +obj-$(CONFIG_USB_NET_CH9200) += ch9200.o diff --git a/drivers/net/usb/ch9200.c b/drivers/net/usb/ch9200.c new file mode 100644 index 000000000000..cabb670eb55c --- /dev/null +++ b/drivers/net/usb/ch9200.c @@ -0,0 +1,443 @@ +/* + * USB 10M/100M ethernet adapter + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CH9200_VID 0x1A86 +#define CH9200_PID_E092 0xE092 + +#define CTRL_TIMEOUT_MS 1000 + +#define CONTROL_TIMEOUT_MS 1000 + +#define REQUEST_READ 0x0E +#define REQUEST_WRITE 0x0F + +/* Address space: + * 00-63 : MII + * 64-128: MAC + * + * Note: all accesses must be 16-bit + */ + +#define MAC_REG_CTRL 64 +#define MAC_REG_STATUS 66 +#define MAC_REG_INTERRUPT_MASK 68 +#define MAC_REG_PHY_COMMAND 70 +#define MAC_REG_PHY_DATA 72 +#define MAC_REG_STATION_L 74 +#define MAC_REG_STATION_M 76 +#define MAC_REG_STATION_H 78 +#define MAC_REG_HASH_L 80 +#define MAC_REG_HASH_M1 82 +#define MAC_REG_HASH_M2 84 +#define MAC_REG_HASH_H 86 +#define MAC_REG_THRESHOLD 88 +#define MAC_REG_FIFO_DEPTH 90 +#define MAC_REG_PAUSE 92 +#define MAC_REG_FLOW_CONTROL 94 + +/* Control register bits + * + * Note: bits 13 and 15 are reserved + */ +#define LOOPBACK (0x01 << 14) +#define BASE100X (0x01 << 12) +#define MBPS_10 (0x01 << 11) +#define DUPLEX_MODE (0x01 << 10) +#define PAUSE_FRAME (0x01 << 9) +#define PROMISCUOUS (0x01 << 8) +#define MULTICAST (0x01 << 7) +#define BROADCAST (0x01 << 6) +#define HASH (0x01 << 5) +#define APPEND_PAD (0x01 << 4) +#define APPEND_CRC (0x01 << 3) +#define TRANSMITTER_ACTION (0x01 << 2) +#define RECEIVER_ACTION (0x01 << 1) +#define DMA_ACTION (0x01 << 0) + +/* Status register bits + * + * Note: bits 7-15 are reserved + */ +#define ALIGNMENT (0x01 << 6) +#define FIFO_OVER_RUN (0x01 << 5) +#define FIFO_UNDER_RUN (0x01 << 4) +#define RX_ERROR (0x01 << 3) +#define RX_COMPLETE (0x01 << 2) +#define TX_ERROR (0x01 << 1) +#define TX_COMPLETE (0x01 << 0) + +/* FIFO depth register bits + * + * Note: bits 6 and 14 are reserved + */ + +#define ETH_TXBD (0x01 << 15) +#define ETN_TX_FIFO_DEPTH (0x01 << 8) +#define ETH_RXBD (0x01 << 7) +#define ETH_RX_FIFO_DEPTH (0x01 << 0) + +static int control_read(struct usbnet *dev, + unsigned char request, unsigned short value, + unsigned short index, void *data, unsigned short size, + int timeout) +{ + unsigned char *buf = NULL; + unsigned char request_type; + int err = 0; + + if (request == REQUEST_READ) + request_type = (USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_OTHER); + else + request_type = (USB_DIR_IN | USB_TYPE_VENDOR | + USB_RECIP_DEVICE); + + netdev_dbg(dev->net, "Control_read() index=0x%02x size=%d\n", + index, size); + + buf = kmalloc(size, GFP_KERNEL); + if (!buf) { + err = -ENOMEM; + goto err_out; + } + + err = usb_control_msg(dev->udev, + usb_rcvctrlpipe(dev->udev, 0), + request, request_type, value, index, buf, size, + timeout); + if (err == size) + memcpy(data, buf, size); + else if (err >= 0) + err = -EINVAL; + kfree(buf); + + return err; + +err_out: + return err; +} + +static int control_write(struct usbnet *dev, unsigned char request, + unsigned short value, unsigned short index, + void *data, unsigned short size, int timeout) +{ + unsigned char *buf = NULL; + unsigned char request_type; + int err = 0; + + if (request == REQUEST_WRITE) + request_type = (USB_DIR_OUT | USB_TYPE_VENDOR | + USB_RECIP_OTHER); + else + request_type = (USB_DIR_OUT | USB_TYPE_VENDOR | + USB_RECIP_DEVICE); + + netdev_dbg(dev->net, "Control_write() index=0x%02x size=%d\n", + index, size); + + if (data) { + buf = kmalloc(size, GFP_KERNEL); + if (!buf) { + err = -ENOMEM; + goto err_out; + } + memcpy(buf, data, size); + } + + err = usb_control_msg(dev->udev, + usb_sndctrlpipe(dev->udev, 0), + request, request_type, value, index, buf, size, + timeout); + if (err >= 0 && err < size) + err = -EINVAL; + kfree(buf); + + return 0; + +err_out: + return err; +} + +static int ch9200_mdio_read(struct net_device *netdev, int phy_id, int loc) +{ + struct usbnet *dev = netdev_priv(netdev); + unsigned char buff[2]; + + netdev_dbg(netdev, "ch9200_mdio_read phy_id:%02x loc:%02x\n", + phy_id, loc); + + if (phy_id != 0) + return -ENODEV; + + control_read(dev, REQUEST_READ, 0, loc * 2, buff, 0x02, + CONTROL_TIMEOUT_MS); + + return (buff[0] | buff[1] << 8); +} + +static void ch9200_mdio_write(struct net_device *netdev, + int phy_id, int loc, int val) +{ + struct usbnet *dev = netdev_priv(netdev); + unsigned char buff[2]; + + netdev_dbg(netdev, "ch9200_mdio_write() phy_id=%02x loc:%02x\n", + phy_id, loc); + + if (phy_id != 0) + return; + + buff[0] = (unsigned char)val; + buff[1] = (unsigned char)(val >> 8); + + control_write(dev, REQUEST_WRITE, 0, loc * 2, buff, 0x02, + CONTROL_TIMEOUT_MS); +} + +static int ch9200_link_reset(struct usbnet *dev) +{ + struct ethtool_cmd ecmd; + + mii_check_media(&dev->mii, 1, 1); + mii_ethtool_gset(&dev->mii, &ecmd); + + netdev_dbg(dev->net, "link_reset() speed:%d duplex:%d\n", + ecmd.speed, ecmd.duplex); + + return 0; +} + +static void ch9200_status(struct usbnet *dev, struct urb *urb) +{ + int link; + unsigned char *buf; + + if (urb->actual_length < 16) + return; + + buf = urb->transfer_buffer; + link = !!(buf[0] & 0x01); + + if (link) { + netif_carrier_on(dev->net); + usbnet_defer_kevent(dev, EVENT_LINK_RESET); + } else { + netif_carrier_off(dev->net); + } +} + +static struct sk_buff *ch9200_tx_fixup(struct usbnet *dev, struct sk_buff *skb, + gfp_t flags) +{ + int i = 0; + int len = 0; + int tx_overhead = 0; + + tx_overhead = 0x40; + + len = skb->len; + if (skb_headroom(skb) < tx_overhead) { + struct sk_buff *skb2; + + skb2 = skb_copy_expand(skb, tx_overhead, 0, flags); + dev_kfree_skb_any(skb); + skb = skb2; + if (!skb) + return NULL; + } + + __skb_push(skb, tx_overhead); + /* usbnet adds padding if length is a multiple of packet size + * if so, adjust length value in header + */ + if ((skb->len % dev->maxpacket) == 0) + len++; + + skb->data[0] = len; + skb->data[1] = len >> 8; + skb->data[2] = 0x00; + skb->data[3] = 0x80; + + for (i = 4; i < 48; i++) + skb->data[i] = 0x00; + + skb->data[48] = len; + skb->data[49] = len >> 8; + skb->data[50] = 0x00; + skb->data[51] = 0x80; + + for (i = 52; i < 64; i++) + skb->data[i] = 0x00; + + return skb; +} + +static int ch9200_rx_fixup(struct usbnet *dev, struct sk_buff *skb) +{ + int len = 0; + int rx_overhead = 0; + + rx_overhead = 64; + + if (unlikely(skb->len < rx_overhead)) { + dev_err(&dev->udev->dev, "unexpected tiny rx frame\n"); + return 0; + } + + len = (skb->data[skb->len - 16] | skb->data[skb->len - 15] << 8); + skb_trim(skb, len); + + return 1; +} + +static int get_mac_address(struct usbnet *dev, unsigned char *data) +{ + int err = 0; + unsigned char mac_addr[0x06]; + int rd_mac_len = 0; + + netdev_dbg(dev->net, "get_mac_address:\n\tusbnet VID:%0x PID:%0x\n", + dev->udev->descriptor.idVendor, + dev->udev->descriptor.idProduct); + + memset(mac_addr, 0, sizeof(mac_addr)); + rd_mac_len = control_read(dev, REQUEST_READ, 0, + MAC_REG_STATION_L, mac_addr, 0x02, + CONTROL_TIMEOUT_MS); + rd_mac_len += control_read(dev, REQUEST_READ, 0, MAC_REG_STATION_M, + mac_addr + 2, 0x02, CONTROL_TIMEOUT_MS); + rd_mac_len += control_read(dev, REQUEST_READ, 0, MAC_REG_STATION_H, + mac_addr + 4, 0x02, CONTROL_TIMEOUT_MS); + if (rd_mac_len != ETH_ALEN) + err = -EINVAL; + + data[0] = mac_addr[5]; + data[1] = mac_addr[4]; + data[2] = mac_addr[3]; + data[3] = mac_addr[2]; + data[4] = mac_addr[1]; + data[5] = mac_addr[0]; + + return err; +} + +static int ch9200_bind(struct usbnet *dev, struct usb_interface *intf) +{ + int retval = 0; + unsigned char data[2]; + + retval = usbnet_get_endpoints(dev, intf); + if (retval) + return retval; + + dev->mii.dev = dev->net; + dev->mii.mdio_read = ch9200_mdio_read; + dev->mii.mdio_write = ch9200_mdio_write; + dev->mii.reg_num_mask = 0x1f; + + dev->mii.phy_id_mask = 0x1f; + + dev->hard_mtu = dev->net->mtu + dev->net->hard_header_len; + dev->rx_urb_size = 24 * 64 + 16; + mii_nway_restart(&dev->mii); + + data[0] = 0x01; + data[1] = 0x0F; + retval = control_write(dev, REQUEST_WRITE, 0, MAC_REG_THRESHOLD, data, + 0x02, CONTROL_TIMEOUT_MS); + + data[0] = 0xA0; + data[1] = 0x90; + retval = control_write(dev, REQUEST_WRITE, 0, MAC_REG_FIFO_DEPTH, data, + 0x02, CONTROL_TIMEOUT_MS); + + data[0] = 0x30; + data[1] = 0x00; + retval = control_write(dev, REQUEST_WRITE, 0, MAC_REG_PAUSE, data, + 0x02, CONTROL_TIMEOUT_MS); + + data[0] = 0x17; + data[1] = 0xD8; + retval = control_write(dev, REQUEST_WRITE, 0, MAC_REG_FLOW_CONTROL, + data, 0x02, CONTROL_TIMEOUT_MS); + + /* Undocumented register */ + data[0] = 0x01; + data[1] = 0x00; + retval = control_write(dev, REQUEST_WRITE, 0, 254, data, 0x02, + CONTROL_TIMEOUT_MS); + + data[0] = 0x5F; + data[1] = 0x0D; + retval = control_write(dev, REQUEST_WRITE, 0, MAC_REG_CTRL, data, 0x02, + CONTROL_TIMEOUT_MS); + + retval = get_mac_address(dev, dev->net->dev_addr); + + return retval; +} + +static const struct driver_info ch9200_info = { + .description = "CH9200 USB to Network Adaptor", + .flags = FLAG_ETHER, + .bind = ch9200_bind, + .rx_fixup = ch9200_rx_fixup, + .tx_fixup = ch9200_tx_fixup, + .status = ch9200_status, + .link_reset = ch9200_link_reset, + .reset = ch9200_link_reset, +}; + +static const struct usb_device_id ch9200_products[] = { + { + USB_DEVICE(0x1A86, 0xE092), + .driver_info = (unsigned long)&ch9200_info, + }, + {}, +}; + +MODULE_DEVICE_TABLE(usb, ch9200_products); + +static struct usb_driver ch9200_driver = { + .name = "ch9200", + .id_table = ch9200_products, + .probe = usbnet_probe, + .disconnect = usbnet_disconnect, + .suspend = usbnet_suspend, + .resume = usbnet_resume, +}; + +static int __init ch9200_init(void) +{ + return usb_register(&ch9200_driver); +} + +static void __exit ch9200_exit(void) +{ + usb_deregister(&ch9200_driver); +} + +module_init(ch9200_init); +module_exit(ch9200_exit); + +MODULE_DESCRIPTION("QinHeng CH9200 USB Network device"); +MODULE_LICENSE("GPL"); From 5eb8f289ac3020a9abad1c3c532ceca83284b6ed Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Fri, 18 Sep 2015 16:20:32 -0400 Subject: [PATCH 064/116] geneve: remove vlan-related feature assignment The code handling vlan tag insertion was dropped in commit 371bd1061d29 ("geneve: Consolidate Geneve functionality in single module."). Now we need to drop the related vlan feature bits in the netdev structure. Signed-off-by: John W. Linville Reviewed-by: Jesse Gross Signed-off-by: David S. Miller --- drivers/net/geneve.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index da3259ce7c8d..8f1e9150341a 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -748,12 +748,8 @@ static void geneve_setup(struct net_device *dev) dev->features |= NETIF_F_RXCSUM; dev->features |= NETIF_F_GSO_SOFTWARE; - dev->vlan_features = dev->features; - dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; - dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; - dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; netif_keep_dst(dev); dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; From 4c5d283acc997a1bd7bc37cddcf7d284521cffff Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Fri, 18 Sep 2015 17:47:55 -0400 Subject: [PATCH 065/116] sunvnet: Invoke SET_NETDEV_DEV() to set up the vdev in vnet_new() `ls /sys/devices/channel-devices/vnet-port-0-0/net' is missing without this change, and applications like NetworkManager are looking in sysfs for the information. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- drivers/net/ethernet/sun/sunvnet.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c index 53fe200e0b79..cc106d892e29 100644 --- a/drivers/net/ethernet/sun/sunvnet.c +++ b/drivers/net/ethernet/sun/sunvnet.c @@ -1756,7 +1756,8 @@ static const struct net_device_ops vnet_ops = { #endif }; -static struct vnet *vnet_new(const u64 *local_mac) +static struct vnet *vnet_new(const u64 *local_mac, + struct vio_dev *vdev) { struct net_device *dev; struct vnet *vp; @@ -1790,6 +1791,8 @@ static struct vnet *vnet_new(const u64 *local_mac) NETIF_F_HW_CSUM | NETIF_F_SG; dev->features = dev->hw_features; + SET_NETDEV_DEV(dev, &vdev->dev); + err = register_netdev(dev); if (err) { pr_err("Cannot register net device, aborting\n"); @@ -1808,7 +1811,8 @@ err_out_free_dev: return ERR_PTR(err); } -static struct vnet *vnet_find_or_create(const u64 *local_mac) +static struct vnet *vnet_find_or_create(const u64 *local_mac, + struct vio_dev *vdev) { struct vnet *iter, *vp; @@ -1821,7 +1825,7 @@ static struct vnet *vnet_find_or_create(const u64 *local_mac) } } if (!vp) - vp = vnet_new(local_mac); + vp = vnet_new(local_mac, vdev); mutex_unlock(&vnet_list_mutex); return vp; @@ -1848,7 +1852,8 @@ static void vnet_cleanup(void) static const char *local_mac_prop = "local-mac-address"; static struct vnet *vnet_find_parent(struct mdesc_handle *hp, - u64 port_node) + u64 port_node, + struct vio_dev *vdev) { const u64 *local_mac = NULL; u64 a; @@ -1869,7 +1874,7 @@ static struct vnet *vnet_find_parent(struct mdesc_handle *hp, if (!local_mac) return ERR_PTR(-ENODEV); - return vnet_find_or_create(local_mac); + return vnet_find_or_create(local_mac, vdev); } static struct ldc_channel_config vnet_ldc_cfg = { @@ -1923,7 +1928,7 @@ static int vnet_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) hp = mdesc_grab(); - vp = vnet_find_parent(hp, vdev->mp); + vp = vnet_find_parent(hp, vdev->mp, vdev); if (IS_ERR(vp)) { pr_err("Cannot find port parent vnet\n"); err = PTR_ERR(vp); From ed2e923945892a8372ab70d2f61d364b0b6d9054 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 Sep 2015 09:08:34 -0700 Subject: [PATCH 066/116] tcp/dccp: fix timewait races in timer handling When creating a timewait socket, we need to arm the timer before allowing other cpus to find it. The signal allowing cpus to find the socket is setting tw_refcnt to non zero value. As we set tw_refcnt in __inet_twsk_hashdance(), we therefore need to call inet_twsk_schedule() first. This also means we need to remove tw_refcnt changes from inet_twsk_schedule() and let the caller handle it. Note that because we use mod_timer_pinned(), we have the guarantee the timer wont expire before we set tw_refcnt as we run in BH context. To make things more readable I introduced inet_twsk_reschedule() helper. When rearming the timer, we can use mod_timer_pending() to make sure we do not rearm a canceled timer. Note: This bug can possibly trigger if packets of a flow can hit multiple cpus. This does not normally happen, unless flow steering is broken somehow. This explains this bug was spotted ~5 months after its introduction. A similar fix is needed for SYN_RECV sockets in reqsk_queue_hash_req(), but will be provided in a separate patch for proper tracking. Fixes: 789f558cfb36 ("tcp/dccp: get rid of central timewait timer") Signed-off-by: Eric Dumazet Reported-by: Ying Cai Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 14 +++++++++++++- net/dccp/minisocks.c | 4 ++-- net/ipv4/inet_timewait_sock.c | 16 ++++++++++------ net/ipv4/tcp_minisocks.c | 13 ++++++------- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 879d6e5a973b..186f3a1e1b1f 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -110,7 +110,19 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, struct inet_hashinfo *hashinfo); -void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo); +void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, + bool rearm); + +static void inline inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo) +{ + __inet_twsk_schedule(tw, timeo, false); +} + +static void inline inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo) +{ + __inet_twsk_schedule(tw, timeo, true); +} + void inet_twsk_deschedule_put(struct inet_timewait_sock *tw); void inet_twsk_purge(struct inet_hashinfo *hashinfo, diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 30addee2dd03..838f524cf11a 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -48,8 +48,6 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) tw->tw_ipv6only = sk->sk_ipv6only; } #endif - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) @@ -60,6 +58,8 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) timeo = DCCP_TIMEWAIT_LEN; inet_twsk_schedule(tw, timeo); + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index ae22cc24fbe8..c67f9bd7699c 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -123,13 +123,15 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, /* * Step 2: Hash TW into tcp ehash chain. * Notes : - * - tw_refcnt is set to 3 because : + * - tw_refcnt is set to 4 because : * - We have one reference from bhash chain. * - We have one reference from ehash chain. + * - We have one reference from timer. + * - One reference for ourself (our caller will release it). * We can use atomic_set() because prior spin_lock()/spin_unlock() * committed into memory all tw fields. */ - atomic_set(&tw->tw_refcnt, 1 + 1 + 1); + atomic_set(&tw->tw_refcnt, 4); inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ @@ -217,7 +219,7 @@ void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) } EXPORT_SYMBOL(inet_twsk_deschedule_put); -void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) +void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) { /* timeout := RTO * 3.5 * @@ -245,12 +247,14 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) */ tw->tw_kill = timeo <= 4*HZ; - if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) { - atomic_inc(&tw->tw_refcnt); + if (!rearm) { + BUG_ON(mod_timer_pinned(&tw->tw_timer, jiffies + timeo)); atomic_inc(&tw->tw_dr->tw_count); + } else { + mod_timer_pending(&tw->tw_timer, jiffies + timeo); } } -EXPORT_SYMBOL_GPL(inet_twsk_schedule); +EXPORT_SYMBOL_GPL(__inet_twsk_schedule); void inet_twsk_purge(struct inet_hashinfo *hashinfo, struct inet_timewait_death_row *twdr, int family) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6d8795b066ac..def765911ff8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -162,9 +162,9 @@ kill_with_rst: if (tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_tw_remember_stamp(tw)) - inet_twsk_schedule(tw, tw->tw_timeout); + inet_twsk_reschedule(tw, tw->tw_timeout); else - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -201,7 +201,7 @@ kill: return TCP_TW_SUCCESS; } } - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; @@ -251,7 +251,7 @@ kill: * Do not reschedule in the last case. */ if (paws_reject || th->ack) - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); @@ -322,9 +322,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } while (0); #endif - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); - /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) timeo = rto; @@ -338,6 +335,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } inet_twsk_schedule(tw, timeo); + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this From 29c6852602e259d2c1882f320b29d5c3fec0de04 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 Sep 2015 09:48:04 -0700 Subject: [PATCH 067/116] inet: fix races in reqsk_queue_hash_req() Before allowing lockless LISTEN processing, we need to make sure to arm the SYN_RECV timer before the req socket is visible in hash tables. Also, req->rsk_hash should be written before we set rsk_refcnt to a non zero value. Fixes: fa76ce7328b2 ("inet: get rid of central tcp/dccp listener timer") Signed-off-by: Eric Dumazet Cc: Ying Cai Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 134957159c27..7bb9c39e0a4d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -685,20 +685,20 @@ void reqsk_queue_hash_req(struct request_sock_queue *queue, req->num_timeout = 0; req->sk = NULL; + setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); + mod_timer_pinned(&req->rsk_timer, jiffies + timeout); + req->rsk_hash = hash; + /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ smp_wmb(); atomic_set(&req->rsk_refcnt, 2); - setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); - req->rsk_hash = hash; spin_lock(&queue->syn_wait_lock); req->dl_next = lopt->syn_table[hash]; lopt->syn_table[hash] = req; spin_unlock(&queue->syn_wait_lock); - - mod_timer_pinned(&req->rsk_timer, jiffies + timeout); } EXPORT_SYMBOL(reqsk_queue_hash_req); From 2df1b131b54f431877a6665139dac805ba5ce1a5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 21 Aug 2015 14:07:13 +0200 Subject: [PATCH 068/116] mac80211: fix VHT MCS mask array overrun The HT MCS mask has 9 bytes, the VHT one only has 8 streams. Split the loops to handle this correctly. Reported-by: Dan Carpenter Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 17b1fe961c5d..a30ec3ce3d25 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2518,15 +2518,17 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, continue; for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) { - if (~sdata->rc_rateidx_mcs_mask[i][j]) + if (~sdata->rc_rateidx_mcs_mask[i][j]) { sdata->rc_has_mcs_mask[i] = true; - - if (~sdata->rc_rateidx_vht_mcs_mask[i][j]) - sdata->rc_has_vht_mcs_mask[i] = true; - - if (sdata->rc_has_mcs_mask[i] && - sdata->rc_has_vht_mcs_mask[i]) break; + } + } + + for (j = 0; j < NL80211_VHT_NSS_MAX; j++) { + if (~sdata->rc_rateidx_vht_mcs_mask[i][j]) { + sdata->rc_has_vht_mcs_mask[i] = true; + break; + } } } From babc305e21ea3811d98b67437299360904ac1b6a Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Mon, 21 Sep 2015 15:47:40 +0300 Subject: [PATCH 069/116] mac80211: reset CQM history upon reconfiguration The current behavior of notifying CQM events is inconsistent: Upon first configuration there is a cqm event with the current status according to threshold configured, regardless of signal stability. When there is reconfiguration no event is sent unless there is a significant change to the signal level according to the new configuration. Since the current reconfiguration behavior might cause missing CQM events in case the current signal did not change but is on the other side of the new threshold, fix that by resetting the stored signal level upon reconfiguration. Signed-off-by: Sara Sharon Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index a30ec3ce3d25..7a77a1470f25 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2474,6 +2474,7 @@ static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy, bss_conf->cqm_rssi_thold = rssi_thold; bss_conf->cqm_rssi_hyst = rssi_hyst; + sdata->u.mgd.last_cqm_event_signal = 0; /* tell the driver upon association, unless already associated */ if (sdata->u.mgd.associated && From 08399efc631960c827cebcfe83ad7ed54ebc012b Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Mon, 21 Sep 2015 10:29:09 -0400 Subject: [PATCH 070/116] geneve: ensure ECN info is handled properly in all tx/rx paths Partially due to a pre-exising "thinko", the new metadata-based tx/rx paths were handling ECN propagation differently than the traditional tx/rx paths. This patch removes the "thinko" (involving multiple ip_hdr assignments) on the rx path and corrects the ECN handling on both the rx and tx paths. Signed-off-by: John W. Linville Reviewed-by: Jesse Gross Signed-off-by: David S. Miller --- drivers/net/geneve.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 8f1e9150341a..4ce8b7b90c7c 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -126,6 +126,8 @@ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) __be32 addr; int err; + iph = ip_hdr(skb); /* outer IP header... */ + if (gs->collect_md) { static u8 zero_vni[3]; @@ -133,7 +135,6 @@ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) addr = 0; } else { vni = gnvh->vni; - iph = ip_hdr(skb); /* Still outer IP header... */ addr = iph->saddr; } @@ -178,7 +179,6 @@ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) skb_reset_network_header(skb); - iph = ip_hdr(skb); /* Now inner IP header... */ err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { @@ -626,6 +626,7 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) struct geneve_sock *gs = geneve->sock; struct ip_tunnel_info *info = NULL; struct rtable *rt = NULL; + const struct iphdr *iip; /* interior IP header */ struct flowi4 fl4; __u8 tos, ttl; __be16 sport; @@ -653,6 +654,8 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); skb_reset_mac_header(skb); + iip = ip_hdr(skb); + if (info) { const struct ip_tunnel_key *key = &info->key; u8 *opts = NULL; @@ -668,19 +671,16 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(err)) goto err; - tos = key->tos; + tos = ip_tunnel_ecn_encap(key->tos, iip, skb); ttl = key->ttl; df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; } else { - const struct iphdr *iip; /* interior IP header */ - udp_csum = false; err = geneve_build_skb(rt, skb, 0, geneve->vni, 0, NULL, udp_csum); if (unlikely(err)) goto err; - iip = ip_hdr(skb); tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, iip, skb); ttl = geneve->ttl; if (!ttl && IN_MULTICAST(ntohl(fl4.daddr))) From 53adc9e83028d9e35b6408231ebaf62a94a16e4d Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 21 Sep 2015 21:42:59 +0100 Subject: [PATCH 071/116] net: dsa: actually force the speed on the CPU port Commit 54d792f257c6 ("net: dsa: Centralise global and port setup code into mv88e6xxx.") merged in the 4.2 merge window broke the link speed forcing for the CPU port of Marvell DSA switches. The original code was: /* MAC Forcing register: don't force link, speed, duplex * or flow control state to any particular values on physical * ports, but force the CPU port and all DSA ports to 1000 Mb/s * full duplex. */ if (dsa_is_cpu_port(ds, p) || ds->dsa_port_mask & (1 << p)) REG_WRITE(addr, 0x01, 0x003e); else REG_WRITE(addr, 0x01, 0x0003); but the new code does a read-modify-write: reg = _mv88e6xxx_reg_read(ds, REG_PORT(port), PORT_PCS_CTRL); if (dsa_is_cpu_port(ds, port) || ds->dsa_port_mask & (1 << port)) { reg |= PORT_PCS_CTRL_FORCE_LINK | PORT_PCS_CTRL_LINK_UP | PORT_PCS_CTRL_DUPLEX_FULL | PORT_PCS_CTRL_FORCE_DUPLEX; if (mv88e6xxx_6065_family(ds)) reg |= PORT_PCS_CTRL_100; else reg |= PORT_PCS_CTRL_1000; The link speed in the PCS control register is a two bit field. Forcing the link speed in this way doesn't ensure that the bit field is set to the correct value - on the hardware I have here, the speed bitfield remains set to 0x03, resulting in the speed not being forced to gigabit. We must clear both bits before forcing the link speed. Fixes: 54d792f257c6 ("net: dsa: Centralise global and port setup code into mv88e6xxx.") Signed-off-by: Russell King Acked-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c index 6f13f7206762..f8baa897d1a0 100644 --- a/drivers/net/dsa/mv88e6xxx.c +++ b/drivers/net/dsa/mv88e6xxx.c @@ -2000,6 +2000,7 @@ static int mv88e6xxx_setup_port(struct dsa_switch *ds, int port) */ reg = _mv88e6xxx_reg_read(ds, REG_PORT(port), PORT_PCS_CTRL); if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) { + reg &= ~PORT_PCS_CTRL_UNFORCED; reg |= PORT_PCS_CTRL_FORCE_LINK | PORT_PCS_CTRL_LINK_UP | PORT_PCS_CTRL_DUPLEX_FULL | From ae5f2fb1d51fa128a460bcfbe3c56d7ab8bf6a43 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 21 Sep 2015 20:21:20 -0700 Subject: [PATCH 072/116] openvswitch: Zero flows on allocation. When support for megaflows was introduced, OVS needed to start installing flows with a mask applied to them. Since masking is an expensive operation, OVS also had an optimization that would only take the parts of the flow keys that were covered by a non-zero mask. The values stored in the remaining pieces should not matter because they are masked out. While this works fine for the purposes of matching (which must always look at the mask), serialization to netlink can be problematic. Since the flow and the mask are serialized separately, the uninitialized portions of the flow can be encoded with whatever values happen to be present. In terms of functionality, this has little effect since these fields will be masked out by definition. However, it leaks kernel memory to userspace, which is a potential security vulnerability. It is also possible that other code paths could look at the masked key and get uninitialized data, although this does not currently appear to be an issue in practice. This removes the mask optimization for flows that are being installed. This was always intended to be the case as the mask optimizations were really targetting per-packet flow operations. Fixes: 03f0d916 ("openvswitch: Mega flow implementation") Signed-off-by: Jesse Gross Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 4 ++-- net/openvswitch/flow_table.c | 23 ++++++++++++----------- net/openvswitch/flow_table.h | 2 +- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 6fbd2decb19e..b816ff871528 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -952,7 +952,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) if (error) goto err_kfree_flow; - ovs_flow_mask_key(&new_flow->key, &key, &mask); + ovs_flow_mask_key(&new_flow->key, &key, true, &mask); /* Extract flow identifier. */ error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], @@ -1080,7 +1080,7 @@ static struct sw_flow_actions *get_flow_actions(struct net *net, struct sw_flow_key masked_key; int error; - ovs_flow_mask_key(&masked_key, key, mask); + ovs_flow_mask_key(&masked_key, key, true, mask); error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log); if (error) { OVS_NLERR(log, diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index d22d8e948d0f..f2ea83ba4763 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -57,20 +57,21 @@ static u16 range_n_bytes(const struct sw_flow_key_range *range) } void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, - const struct sw_flow_mask *mask) + bool full, const struct sw_flow_mask *mask) { - const long *m = (const long *)((const u8 *)&mask->key + - mask->range.start); - const long *s = (const long *)((const u8 *)src + - mask->range.start); - long *d = (long *)((u8 *)dst + mask->range.start); + int start = full ? 0 : mask->range.start; + int len = full ? sizeof *dst : range_n_bytes(&mask->range); + const long *m = (const long *)((const u8 *)&mask->key + start); + const long *s = (const long *)((const u8 *)src + start); + long *d = (long *)((u8 *)dst + start); int i; - /* The memory outside of the 'mask->range' are not set since - * further operations on 'dst' only uses contents within - * 'mask->range'. + /* If 'full' is true then all of 'dst' is fully initialized. Otherwise, + * if 'full' is false the memory outside of the 'mask->range' is left + * uninitialized. This can be used as an optimization when further + * operations on 'dst' only use contents within 'mask->range'. */ - for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long)) + for (i = 0; i < len; i += sizeof(long)) *d++ = *s++ & *m++; } @@ -475,7 +476,7 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, u32 hash; struct sw_flow_key masked_key; - ovs_flow_mask_key(&masked_key, unmasked, mask); + ovs_flow_mask_key(&masked_key, unmasked, false, mask); hash = flow_hash(&masked_key, &mask->range); head = find_bucket(ti, hash); hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) { diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 616eda10d955..2dd9900f533d 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -86,5 +86,5 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *, bool ovs_flow_cmp(const struct sw_flow *, const struct sw_flow_match *); void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, - const struct sw_flow_mask *mask); + bool full, const struct sw_flow_mask *mask); #endif /* flow_table.h */ From 23eedbc2435ddd226717603c4f3c8efec7bdbb4d Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 22 Sep 2015 09:29:49 +0200 Subject: [PATCH 073/116] ch9200: Convert to use module_usb_driver Converts the ch9200 driver to use the module_usb_driver() macro which makes the code smaller and a bit simpler. Signed-off-by: Tobias Klauser Acked-by: Matthew Garrett Signed-off-by: David S. Miller --- drivers/net/usb/ch9200.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/net/usb/ch9200.c b/drivers/net/usb/ch9200.c index cabb670eb55c..5e151e6a3e09 100644 --- a/drivers/net/usb/ch9200.c +++ b/drivers/net/usb/ch9200.c @@ -426,18 +426,7 @@ static struct usb_driver ch9200_driver = { .resume = usbnet_resume, }; -static int __init ch9200_init(void) -{ - return usb_register(&ch9200_driver); -} - -static void __exit ch9200_exit(void) -{ - usb_deregister(&ch9200_driver); -} - -module_init(ch9200_init); -module_exit(ch9200_exit); +module_usb_driver(ch9200_driver); MODULE_DESCRIPTION("QinHeng CH9200 USB Network device"); MODULE_LICENSE("GPL"); From 7def0f952eccdd0edb3c504f4dab35ee0d3aba1f Mon Sep 17 00:00:00 2001 From: Dmitriy Vyukov Date: Tue, 22 Sep 2015 10:51:52 +0200 Subject: [PATCH 074/116] lib: fix data race in rhashtable_rehash_one rhashtable_rehash_one() uses complex logic to update entry->next field, after INIT_RHT_NULLS_HEAD and NULLS_MARKER expansion: entry->next = 1 | ((base + off) << 1) This can be compiled along the lines of: entry->next = base + off entry->next <<= 1 entry->next |= 1 Which will break concurrent readers. NULLS value recomputation is not needed here, so just remove the complex logic. The data race was found with KernelThreadSanitizer (KTSAN). Signed-off-by: Dmitry Vyukov Acked-by: Eric Dumazet Acked-by: Thomas Graf Acked-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index cc0c69710dcf..a54ff8949f91 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -187,10 +187,7 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash) head = rht_dereference_bucket(new_tbl->buckets[new_hash], new_tbl, new_hash); - if (rht_is_a_nulls(head)) - INIT_RHT_NULLS_HEAD(entry->next, ht, new_hash); - else - RCU_INIT_POINTER(entry->next, head); + RCU_INIT_POINTER(entry->next, head); rcu_assign_pointer(new_tbl->buckets[new_hash], entry); spin_unlock(new_bucket_lock); From fbd03513bf36c4e5c2942f436f05c8eb99a3cc9e Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Tue, 22 Sep 2015 11:28:14 +0200 Subject: [PATCH 075/116] net: dsa: Fix Marvell Egress Trailer check The Marvell Egress rx trailer check must be fixed to correctly detect bad bits in the third byte of the Eggress trailer as described in the Table 28 of the 88E6060 datasheet. The current code incorrectly omits to check the third byte and checks the fourth byte twice. Signed-off-by: Neil Armstrong Acked-by: Guenter Roeck Signed-off-by: David S. Miller --- net/dsa/tag_trailer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index d25efc93d8f1..b6ca0890d018 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -78,7 +78,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev, trailer = skb_tail_pointer(skb) - 4; if (trailer[0] != 0x80 || (trailer[1] & 0xf8) != 0x00 || - (trailer[3] & 0xef) != 0x00 || trailer[3] != 0x00) + (trailer[2] & 0xef) != 0x00 || trailer[3] != 0x00) goto out_drop; source_port = trailer[1] & 7; From 675ee231d960af2af3606b4480324e26797eb010 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 23 Sep 2015 14:00:21 -0700 Subject: [PATCH 076/116] tcp: add proper TS val into RST packets RST packets sent on behalf of TCP connections with TS option (RFC 7323 TCP timestamps) have incorrect TS val (set to 0), but correct TS ecr. A > B: Flags [S], seq 0, win 65535, options [mss 1000,nop,nop,TS val 100 ecr 0], length 0 B > A: Flags [S.], seq 2444755794, ack 1, win 28960, options [mss 1460,nop,nop,TS val 7264344 ecr 100], length 0 A > B: Flags [.], ack 1, win 65535, options [nop,nop,TS val 110 ecr 7264344], length 0 B > A: Flags [R.], seq 1, ack 1, win 28960, options [nop,nop,TS val 0 ecr 110], length 0 We need to call skb_mstamp_get() to get proper TS val, derived from skb->skb_mstamp Note that RFC 1323 was advocating to not send TS option in RST segment, but RFC 7323 recommends the opposite : Once TSopt has been successfully negotiated, that is both and contain TSopt, the TSopt MUST be sent in every non- segment for the duration of the connection, and SHOULD be sent in an segment (see Section 5.2 for details) Note this RFC recommends to send TS val = 0, but we believe it is premature : We do not know if all TCP stacks are properly handling the receive side : When an segment is received, it MUST NOT be subjected to the PAWS check by verifying an acceptable value in SEG.TSval, and information from the Timestamps option MUST NOT be used to update connection state information. SEG.TSecr MAY be used to provide stricter acceptance checks. In 5 years, if/when all TCP stack are RFC 7323 ready, we might consider to decide to send TS val = 0, if it buys something. Fixes: 7faee5c0d514 ("tcp: remove TCP_SKB_CB(skb)->when") Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f9a8a12b62ee..1100ffe4a722 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2897,6 +2897,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) skb_reserve(skb, MAX_TCP_HEADER); tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); + skb_mstamp_get(&skb->skb_mstamp); /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); From 2d8bff12699abc3a9bf886bb0b79f44d94d81496 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 23 Sep 2015 14:57:58 -0400 Subject: [PATCH 077/116] netpoll: Close race condition between poll_one_napi and napi_disable Drivers might call napi_disable while not holding the napi instance poll_lock. In those instances, its possible for a race condition to exist between poll_one_napi and napi_disable. That is to say, poll_one_napi only tests the NAPI_STATE_SCHED bit to see if there is work to do during a poll, and as such the following may happen: CPU0 CPU1 ndo_tx_timeout napi_poll_dev napi_disable poll_one_napi test_and_set_bit (ret 0) test_bit (ret 1) reset adapter napi_poll_routine If the adapter gets a tx timeout without a napi instance scheduled, its possible for the adapter to think it has exclusive access to the hardware (as the napi instance is now scheduled via the napi_disable call), while the netpoll code thinks there is simply work to do. The result is parallel hardware access leading to corrupt data structures in the driver, and a crash. Additionaly, there is another, more critical race between netpoll and napi_disable. The disabled napi state is actually identical to the scheduled state for a given napi instance. The implication being that, if a napi instance is disabled, a netconsole instance would see the napi state of the device as having been scheduled, and poll it, likely while the driver was dong something requiring exclusive access. In the case above, its fairly clear that not having the rings in a state ready to be polled will cause any number of crashes. The fix should be pretty easy. netpoll uses its own bit to indicate that that the napi instance is in a state of being serviced by netpoll (NAPI_STATE_NPSVC). We can just gate disabling on that bit as well as the sched bit. That should prevent netpoll from conducting a napi poll if we convert its set bit to a test_and_set_bit operation to provide mutual exclusion Change notes: V2) Remove a trailing whtiespace Resubmit with proper subject prefix V3) Clean up spacing nits Signed-off-by: Neil Horman CC: "David S. Miller" CC: jmaxwell@redhat.com Tested-by: jmaxwell@redhat.com Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 2 ++ net/core/netpoll.c | 10 ++++++++-- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 88a00694eda5..2d15e3831440 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -507,6 +507,7 @@ static inline void napi_enable(struct napi_struct *n) BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); + clear_bit(NAPI_STATE_NPSVC, &n->state); } #ifdef CONFIG_SMP diff --git a/net/core/dev.c b/net/core/dev.c index 877c84834d81..6bb6470f5b7b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4713,6 +4713,8 @@ void napi_disable(struct napi_struct *n) while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) msleep(1); + while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state)) + msleep(1); hrtimer_cancel(&n->timer); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 6aa3db8dfc3b..8bdada242a7d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -142,7 +142,7 @@ static void queue_process(struct work_struct *work) */ static int poll_one_napi(struct napi_struct *napi, int budget) { - int work; + int work = 0; /* net_rx_action's ->poll() invocations and our's are * synchronized by this test which is only made while @@ -151,7 +151,12 @@ static int poll_one_napi(struct napi_struct *napi, int budget) if (!test_bit(NAPI_STATE_SCHED, &napi->state)) return budget; - set_bit(NAPI_STATE_NPSVC, &napi->state); + /* If we set this bit but see that it has already been set, + * that indicates that napi has been disabled and we need + * to abort this operation + */ + if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state)) + goto out; work = napi->poll(napi, budget); WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll); @@ -159,6 +164,7 @@ static int poll_one_napi(struct napi_struct *napi, int budget) clear_bit(NAPI_STATE_NPSVC, &napi->state); +out: return budget - work; } From d3869efe7a8a2298516d9af4f91487cf486ca945 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 23 Sep 2015 19:45:08 +0100 Subject: [PATCH 078/116] Fix AF_PACKET ABI breakage in 4.2 Commit 7d82410950aa ("virtio: add explicit big-endian support to memory accessors") accidentally changed the virtio_net header used by AF_PACKET with PACKET_VNET_HDR from host-endian to big-endian. Since virtio_legacy_is_little_endian() is a very long identifier, define a vio_le macro and use that throughout the code instead of the hard-coded 'false' for little-endian. This restores the ABI to match 4.1 and earlier kernels, and makes my test program work again. Signed-off-by: David Woodhouse Signed-off-by: David S. Miller --- net/packet/af_packet.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 7b8e39a22387..aa4b15c35884 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -230,6 +230,8 @@ struct packet_skb_cb { } sa; }; +#define vio_le() virtio_legacy_is_little_endian() + #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) @@ -2680,15 +2682,15 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_unlock; if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && - (__virtio16_to_cpu(false, vnet_hdr.csum_start) + - __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 > - __virtio16_to_cpu(false, vnet_hdr.hdr_len))) - vnet_hdr.hdr_len = __cpu_to_virtio16(false, - __virtio16_to_cpu(false, vnet_hdr.csum_start) + - __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2); + (__virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) + + __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2 > + __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len))) + vnet_hdr.hdr_len = __cpu_to_virtio16(vio_le(), + __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) + + __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2); err = -EINVAL; - if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len) + if (__virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len) > len) goto out_unlock; if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { @@ -2731,7 +2733,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, - __virtio16_to_cpu(false, vnet_hdr.hdr_len), + __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len), msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out_unlock; @@ -2778,8 +2780,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (po->has_vnet_hdr) { if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start); - u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset); + u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start); + u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset); if (!skb_partial_csum_set(skb, s, o)) { err = -EINVAL; goto out_free; @@ -2787,7 +2789,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) } skb_shinfo(skb)->gso_size = - __virtio16_to_cpu(false, vnet_hdr.gso_size); + __virtio16_to_cpu(vio_le(), vnet_hdr.gso_size); skb_shinfo(skb)->gso_type = gso_type; /* Header must be checked, and gso_segs computed. */ @@ -3161,9 +3163,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, /* This is a hint as to how much should be linear. */ vnet_hdr.hdr_len = - __cpu_to_virtio16(false, skb_headlen(skb)); + __cpu_to_virtio16(vio_le(), skb_headlen(skb)); vnet_hdr.gso_size = - __cpu_to_virtio16(false, sinfo->gso_size); + __cpu_to_virtio16(vio_le(), sinfo->gso_size); if (sinfo->gso_type & SKB_GSO_TCPV4) vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (sinfo->gso_type & SKB_GSO_TCPV6) @@ -3181,9 +3183,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (skb->ip_summed == CHECKSUM_PARTIAL) { vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - vnet_hdr.csum_start = __cpu_to_virtio16(false, + vnet_hdr.csum_start = __cpu_to_virtio16(vio_le(), skb_checksum_start_offset(skb)); - vnet_hdr.csum_offset = __cpu_to_virtio16(false, + vnet_hdr.csum_offset = __cpu_to_virtio16(vio_le(), skb->csum_offset); } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID; From 8c85151ddec66f78fbf997e35be005a322fbb9c9 Mon Sep 17 00:00:00 2001 From: WingMan Kwok Date: Wed, 23 Sep 2015 13:37:05 -0400 Subject: [PATCH 079/116] net: netcp: ethss: fix error in calling sgmii api with incorrect offset On K2HK, sgmii module registers of slave 0 and 1 are mem mapped to one contiguous block, while those of slave 2 and 3 are mapped to another contiguous block. However, on K2E and K2L, sgmii module registers of all slaves are mem mapped to one contiguous block. SGMII APIs expect slave 0 sgmii base when API is invoked for slave 0 and 1, and slave 2 sgmii base when invoked for other slaves. Before this patch, slave 0 sgmii base is always passed to sgmii API for K2E regardless which slave is the API invoked for. This patch fixes the problem. Signed-off-by: WingMan Kwok Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/netcp_ethss.c | 47 ++++++++++++--------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c index 6f16d6aaf7b7..6bff8d82ceab 100644 --- a/drivers/net/ethernet/ti/netcp_ethss.c +++ b/drivers/net/ethernet/ti/netcp_ethss.c @@ -77,6 +77,7 @@ #define GBENU_ALE_OFFSET 0x1e000 #define GBENU_HOST_PORT_NUM 0 #define GBENU_NUM_ALE_ENTRIES 1024 +#define GBENU_SGMII_MODULE_SIZE 0x100 /* 10G Ethernet SS defines */ #define XGBE_MODULE_NAME "netcp-xgbe" @@ -149,8 +150,8 @@ #define XGBE_STATS2_MODULE 2 /* s: 0-based slave_port */ -#define SGMII_BASE(s) \ - (((s) < 2) ? gbe_dev->sgmii_port_regs : gbe_dev->sgmii_port34_regs) +#define SGMII_BASE(d, s) \ + (((s) < 2) ? (d)->sgmii_port_regs : (d)->sgmii_port34_regs) #define GBE_TX_QUEUE 648 #define GBE_TXHOOK_ORDER 0 @@ -1997,13 +1998,8 @@ static void netcp_ethss_update_link_state(struct gbe_priv *gbe_dev, return; if (!SLAVE_LINK_IS_XGMII(slave)) { - if (gbe_dev->ss_version == GBE_SS_VERSION_14) - sgmii_link_state = - netcp_sgmii_get_port_link(SGMII_BASE(sp), sp); - else - sgmii_link_state = - netcp_sgmii_get_port_link( - gbe_dev->sgmii_port_regs, sp); + sgmii_link_state = + netcp_sgmii_get_port_link(SGMII_BASE(gbe_dev, sp), sp); } phy_link_state = gbe_phy_link_status(slave); @@ -2100,17 +2096,11 @@ static void gbe_port_config(struct gbe_priv *gbe_dev, struct gbe_slave *slave, static void gbe_sgmii_rtreset(struct gbe_priv *priv, struct gbe_slave *slave, bool set) { - void __iomem *sgmii_port_regs; - if (SLAVE_LINK_IS_XGMII(slave)) return; - if ((priv->ss_version == GBE_SS_VERSION_14) && (slave->slave_num >= 2)) - sgmii_port_regs = priv->sgmii_port34_regs; - else - sgmii_port_regs = priv->sgmii_port_regs; - - netcp_sgmii_rtreset(sgmii_port_regs, slave->slave_num, set); + netcp_sgmii_rtreset(SGMII_BASE(priv, slave->slave_num), + slave->slave_num, set); } static void gbe_slave_stop(struct gbe_intf *intf) @@ -2136,17 +2126,12 @@ static void gbe_slave_stop(struct gbe_intf *intf) static void gbe_sgmii_config(struct gbe_priv *priv, struct gbe_slave *slave) { - void __iomem *sgmii_port_regs; + if (SLAVE_LINK_IS_XGMII(slave)) + return; - sgmii_port_regs = priv->sgmii_port_regs; - if ((priv->ss_version == GBE_SS_VERSION_14) && (slave->slave_num >= 2)) - sgmii_port_regs = priv->sgmii_port34_regs; - - if (!SLAVE_LINK_IS_XGMII(slave)) { - netcp_sgmii_reset(sgmii_port_regs, slave->slave_num); - netcp_sgmii_config(sgmii_port_regs, slave->slave_num, - slave->link_interface); - } + netcp_sgmii_reset(SGMII_BASE(priv, slave->slave_num), slave->slave_num); + netcp_sgmii_config(SGMII_BASE(priv, slave->slave_num), slave->slave_num, + slave->link_interface); } static int gbe_slave_open(struct gbe_intf *gbe_intf) @@ -2997,6 +2982,14 @@ static int set_gbenu_ethss_priv(struct gbe_priv *gbe_dev, gbe_dev->switch_regs = regs; gbe_dev->sgmii_port_regs = gbe_dev->ss_regs + GBENU_SGMII_MODULE_OFFSET; + + /* Although sgmii modules are mem mapped to one contiguous + * region on GBENU devices, setting sgmii_port34_regs allows + * consistent code when accessing sgmii api + */ + gbe_dev->sgmii_port34_regs = gbe_dev->sgmii_port_regs + + (2 * GBENU_SGMII_MODULE_SIZE); + gbe_dev->host_port_regs = gbe_dev->switch_regs + GBENU_HOST_PORT_OFFSET; for (i = 0; i < (gbe_dev->max_num_ports); i++) From 156e3c21f89655f099228577005a6c656b3ceb3d Mon Sep 17 00:00:00 2001 From: "Karicheri, Muralidharan" Date: Wed, 23 Sep 2015 13:37:06 -0400 Subject: [PATCH 080/116] net: netcp: remove dead code from the driver netcp_core is the first driver that will get initialized and the modules (ethss, pa etc) will then get initialized. So the code at the end of netcp_probe() that iterate over the modules is a dead code as the module list will be always be empty. So remove this code. Signed-off-by: Murali Karicheri Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/netcp_core.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index 1a5aca55ea9f..c0bc4b9658fe 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -2040,7 +2040,6 @@ static int netcp_probe(struct platform_device *pdev) struct device_node *child, *interfaces; struct netcp_device *netcp_device; struct device *dev = &pdev->dev; - struct netcp_module *module; int ret; if (!node) { @@ -2087,14 +2086,6 @@ static int netcp_probe(struct platform_device *pdev) /* Add the device instance to the list */ list_add_tail(&netcp_device->device_list, &netcp_devices); - /* Probe & attach any modules already registered */ - mutex_lock(&netcp_modules_lock); - for_each_netcp_module(module) { - ret = netcp_module_probe(netcp_device, module); - if (ret < 0) - dev_err(dev, "module(%s) probe failed\n", module->name); - } - mutex_unlock(&netcp_modules_lock); return 0; probe_quit_interface: From 736532a0705ffc27c14f712fa2758a7f8b15e8b4 Mon Sep 17 00:00:00 2001 From: "Karicheri, Muralidharan" Date: Wed, 23 Sep 2015 13:37:07 -0400 Subject: [PATCH 081/116] net: netcp: move netcp_register_interface() to after attach module The netcp interface is not fully initialized before attach the module to the interface. For example, the tx pipe/rx pipe is initialized in ethss module as part of attach(). So until this is complete, the interface can't be registered. So move registration of interface to net device outside the current loop that attaches the modules to the interface. Signed-off-by: Murali Karicheri Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/netcp_core.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index c0bc4b9658fe..cf693dec2e44 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -291,13 +291,6 @@ static int netcp_module_probe(struct netcp_device *netcp_device, interface_list) { struct netcp_intf_modpriv *intf_modpriv; - /* If interface not registered then register now */ - if (!netcp_intf->netdev_registered) - ret = netcp_register_interface(netcp_intf); - - if (ret) - return -ENODEV; - intf_modpriv = devm_kzalloc(dev, sizeof(*intf_modpriv), GFP_KERNEL); if (!intf_modpriv) @@ -323,6 +316,18 @@ static int netcp_module_probe(struct netcp_device *netcp_device, continue; } } + + /* Now register the interface with netdev */ + list_for_each_entry(netcp_intf, + &netcp_device->interface_head, + interface_list) { + /* If interface not registered then register now */ + if (!netcp_intf->netdev_registered) { + ret = netcp_register_interface(netcp_intf); + if (ret) + return -ENODEV; + } + } return 0; } From e558b1fbf5f43da83f91a31e595a6d65e663b100 Mon Sep 17 00:00:00 2001 From: "Karicheri, Muralidharan" Date: Wed, 23 Sep 2015 13:37:08 -0400 Subject: [PATCH 082/116] net: netcp: add error check to netcp_allocate_rx_buf() Currently, if netcp_allocate_rx_buf() fails due no descriptors in the rx free descriptor queue, inside the netcp_rxpool_refill() function the iterative loop to fill buffers doesn't terminate right away. So modify the netcp_allocate_rx_buf() to return an error code and use it break the loop when there is error. Signed-off-by: Murali Karicheri Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/netcp_core.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index cf693dec2e44..97e2629c10b9 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -801,7 +801,7 @@ static void netcp_rxpool_free(struct netcp_intf *netcp) netcp->rx_pool = NULL; } -static void netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq) +static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq) { struct knav_dma_desc *hwdesc; unsigned int buf_len, dma_sz; @@ -815,7 +815,7 @@ static void netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq) hwdesc = knav_pool_desc_get(netcp->rx_pool); if (IS_ERR_OR_NULL(hwdesc)) { dev_dbg(netcp->ndev_dev, "out of rx pool desc\n"); - return; + return -ENOMEM; } if (likely(fdq == 0)) { @@ -867,25 +867,26 @@ static void netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq) knav_pool_desc_map(netcp->rx_pool, hwdesc, sizeof(*hwdesc), &dma, &dma_sz); knav_queue_push(netcp->rx_fdq[fdq], dma, sizeof(*hwdesc), 0); - return; + return 0; fail: knav_pool_desc_put(netcp->rx_pool, hwdesc); + return -ENOMEM; } /* Refill Rx FDQ with descriptors & attached buffers */ static void netcp_rxpool_refill(struct netcp_intf *netcp) { u32 fdq_deficit[KNAV_DMA_FDQ_PER_CHAN] = {0}; - int i; + int i, ret = 0; /* Calculate the FDQ deficit and refill */ for (i = 0; i < KNAV_DMA_FDQ_PER_CHAN && netcp->rx_fdq[i]; i++) { fdq_deficit[i] = netcp->rx_queue_depths[i] - knav_queue_get_count(netcp->rx_fdq[i]); - while (fdq_deficit[i]--) - netcp_allocate_rx_buf(netcp, i); + while (fdq_deficit[i]-- && !ret) + ret = netcp_allocate_rx_buf(netcp, i); } /* end for fdqs */ } From 915c5857874fc211874de1363e88f902e581e6eb Mon Sep 17 00:00:00 2001 From: "Karicheri, Muralidharan" Date: Wed, 23 Sep 2015 13:37:09 -0400 Subject: [PATCH 083/116] net: netcp: check for interface handle in netcp_module_probe() Currently netcp_module_probe() doesn't check the return value of of_parse_phandle() that points to the interface data for the module and then pass the node ptr to the module which is incorrect. Check for return value and free the intf_modpriv if there is error. Signed-off-by: Murali Karicheri Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/netcp_core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index 97e2629c10b9..d39dce3b37a7 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -299,6 +299,11 @@ static int netcp_module_probe(struct netcp_device *netcp_device, interface = of_parse_phandle(netcp_intf->node_interface, module->name, 0); + if (!interface) { + devm_kfree(dev, intf_modpriv); + continue; + } + intf_modpriv->netcp_priv = netcp_intf; intf_modpriv->netcp_module = module; list_add_tail(&intf_modpriv->intf_list, From 99f8ef5dc6546ac28cc7a03ff8301bc72fe5527e Mon Sep 17 00:00:00 2001 From: "Karicheri, Muralidharan" Date: Wed, 23 Sep 2015 13:37:10 -0400 Subject: [PATCH 084/116] net: netcp: allocate buffers to desc before re-enable interrupt Currently netcp_rxpool_refill() that refill descriptors and attached buffers to fdq while interrupt is enabled as part of NAPI poll. Doing it while interrupt is disabled could be beneficial as hardware will not be starved when CPU is busy with processing interrupt. Signed-off-by: Murali Karicheri Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/netcp_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index d39dce3b37a7..8026daaf940b 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -904,12 +904,12 @@ static int netcp_rx_poll(struct napi_struct *napi, int budget) packets = netcp_process_rx_packets(netcp, budget); + netcp_rxpool_refill(netcp); if (packets < budget) { napi_complete(&netcp->rx_napi); knav_queue_enable_notify(netcp->rx_queue); } - netcp_rxpool_refill(netcp); return packets; } From 8ceaf361ffd131e835aef1e6cdb1d5ba70702617 Mon Sep 17 00:00:00 2001 From: "Karicheri, Muralidharan" Date: Wed, 23 Sep 2015 13:37:11 -0400 Subject: [PATCH 085/116] net: netcp: fix deadlock reported by lockup detector A deadlock trace is seen in netcp driver with lockup detector enabled. The trace log is provided below for reference. This patch fixes the bug by removing the usage of netcp_modules_lock within ndo_ops functions. ndo_{open/close/ioctl)() is already called with rtnl_lock held. So there is no need to hold another mutex for serialization across processes on multiple cores. So remove use of netcp_modules_lock mutex from these ndo ops functions. ndo_set_rx_mode() shouldn't be using a mutex as it is called from atomic context. In the case of ndo_set_rx_mode(), there can be call to this API without rtnl_lock held from an atomic context. As the underlying modules are expected to add address to a hardware table, it is to be protected across concurrent updates and hence a spin lock is used to synchronize the access. Same with ndo_vlan_rx_add_vid() & ndo_vlan_rx_kill_vid(). Probably the netcp_modules_lock is used to protect the module not being removed as part of rmmod. Currently this is not fully implemented and assumes the interface is brought down before doing rmmod of modules. The support for rmmmod while interface is up is expected in a future patch set when additional modules such as pa, qos are added. For now all of the tests such as if up/down, reboot, iperf works fine with this patch applied. Deadlock trace seen with lockup detector enabled is shown below for reference. [ 16.863014] ====================================================== [ 16.869183] [ INFO: possible circular locking dependency detected ] [ 16.875441] 4.1.6-01265-gfb1e101 #1 Tainted: G W [ 16.881176] ------------------------------------------------------- [ 16.887432] ifconfig/1662 is trying to acquire lock: [ 16.892386] (netcp_modules_lock){+.+.+.}, at: [] netcp_ndo_open+0x168/0x518 [ 16.900321] [ 16.900321] but task is already holding lock: [ 16.906144] (rtnl_mutex){+.+.+.}, at: [] devinet_ioctl+0xf8/0x7e4 [ 16.913206] [ 16.913206] which lock already depends on the new lock. [ 16.913206] [ 16.921372] [ 16.921372] the existing dependency chain (in reverse order) is: [ 16.928844] -> #1 (rtnl_mutex){+.+.+.}: [ 16.932865] [] mutex_lock_nested+0x68/0x4a8 [ 16.938521] [] register_netdev+0xc/0x24 [ 16.943831] [] netcp_module_probe+0x214/0x2ec [ 16.949660] [] netcp_register_module+0xd4/0x140 [ 16.955663] [] keystone_gbe_init+0x10/0x28 [ 16.961233] [] do_one_initcall+0xb8/0x1f8 [ 16.966714] [] kernel_init_freeable+0x148/0x1e8 [ 16.972720] [] kernel_init+0xc/0xe8 [ 16.977682] [] ret_from_fork+0x14/0x3c [ 16.982905] -> #0 (netcp_modules_lock){+.+.+.}: [ 16.987619] [] lock_acquire+0x118/0x320 [ 16.992928] [] mutex_lock_nested+0x68/0x4a8 [ 16.998582] [] netcp_ndo_open+0x168/0x518 [ 17.004064] [] __dev_open+0xa8/0x10c [ 17.009112] [] __dev_change_flags+0x94/0x144 [ 17.014853] [] dev_change_flags+0x18/0x48 [ 17.020334] [] devinet_ioctl+0x6dc/0x7e4 [ 17.025729] [] sock_ioctl+0x1d0/0x2a8 [ 17.030865] [] do_vfs_ioctl+0x41c/0x688 [ 17.036173] [] SyS_ioctl+0x34/0x5c [ 17.041046] [] ret_fast_syscall+0x0/0x54 [ 17.046441] [ 17.046441] other info that might help us debug this: [ 17.046441] [ 17.054434] Possible unsafe locking scenario: [ 17.054434] [ 17.060343] CPU0 CPU1 [ 17.064862] ---- ---- [ 17.069381] lock(rtnl_mutex); [ 17.072522] lock(netcp_modules_lock); [ 17.078875] lock(rtnl_mutex); [ 17.084532] lock(netcp_modules_lock); [ 17.088366] [ 17.088366] *** DEADLOCK *** [ 17.088366] [ 17.094279] 1 lock held by ifconfig/1662: [ 17.098278] #0: (rtnl_mutex){+.+.+.}, at: [] devinet_ioctl+0xf8/0x7e4 [ 17.105774] [ 17.105774] stack backtrace: [ 17.110124] CPU: 1 PID: 1662 Comm: ifconfig Tainted: G W 4.1.6-01265-gfb1e101 #1 [ 17.118637] Hardware name: Keystone [ 17.122123] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 17.129862] [] (show_stack) from [] (dump_stack+0x84/0xc4) [ 17.137079] [] (dump_stack) from [] (print_circular_bug+0x210/0x330) [ 17.145161] [] (print_circular_bug) from [] (validate_chain.isra.35+0xf98/0x13ac) [ 17.154372] [] (validate_chain.isra.35) from [] (__lock_acquire+0x52c/0xcc0) [ 17.163149] [] (__lock_acquire) from [] (lock_acquire+0x118/0x320) [ 17.171058] [] (lock_acquire) from [] (mutex_lock_nested+0x68/0x4a8) [ 17.179140] [] (mutex_lock_nested) from [] (netcp_ndo_open+0x168/0x518) [ 17.187484] [] (netcp_ndo_open) from [] (__dev_open+0xa8/0x10c) [ 17.195133] [] (__dev_open) from [] (__dev_change_flags+0x94/0x144) [ 17.203129] [] (__dev_change_flags) from [] (dev_change_flags+0x18/0x48) [ 17.211560] [] (dev_change_flags) from [] (devinet_ioctl+0x6dc/0x7e4) [ 17.219729] [] (devinet_ioctl) from [] (sock_ioctl+0x1d0/0x2a8) [ 17.227378] [] (sock_ioctl) from [] (do_vfs_ioctl+0x41c/0x688) [ 17.234939] [] (do_vfs_ioctl) from [] (SyS_ioctl+0x34/0x5c) [ 17.242242] [] (SyS_ioctl) from [] (ret_fast_syscall+0x0/0x54) [ 17.258855] netcp-1.0 2620110.netcp eth0: Link is Up - 1Gbps/Full - flow control off [ 17.271282] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:616 [ 17.279712] in_atomic(): 1, irqs_disabled(): 0, pid: 1662, name: ifconfig [ 17.286500] INFO: lockdep is turned off. [ 17.290413] Preemption disabled at:[< (null)>] (null) [ 17.295728] [ 17.297214] CPU: 1 PID: 1662 Comm: ifconfig Tainted: G W 4.1.6-01265-gfb1e101 #1 [ 17.305735] Hardware name: Keystone [ 17.309223] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 17.316970] [] (show_stack) from [] (dump_stack+0x84/0xc4) [ 17.324194] [] (dump_stack) from [] (mutex_lock_nested+0x28/0x4a8) [ 17.332112] [] (mutex_lock_nested) from [] (netcp_set_rx_mode+0x160/0x210) [ 17.340724] [] (netcp_set_rx_mode) from [] (dev_set_rx_mode+0x1c/0x28) [ 17.348982] [] (dev_set_rx_mode) from [] (__dev_open+0xc4/0x10c) [ 17.356724] [] (__dev_open) from [] (__dev_change_flags+0x94/0x144) [ 17.364729] [] (__dev_change_flags) from [] (dev_change_flags+0x18/0x48) [ 17.373166] [] (dev_change_flags) from [] (devinet_ioctl+0x6dc/0x7e4) [ 17.381344] [] (devinet_ioctl) from [] (sock_ioctl+0x1d0/0x2a8) [ 17.388994] [] (sock_ioctl) from [] (do_vfs_ioctl+0x41c/0x688) [ 17.396563] [] (do_vfs_ioctl) from [] (SyS_ioctl+0x34/0x5c) [ 17.403873] [] (SyS_ioctl) from [] (ret_fast_syscall+0x0/0x54) [ 17.413772] IPv6: ADDRCONF(NETDEV_UP): eth0: link is not ready udhcpc (v1.20.2) started Sending discover... [ 18.690666] netcp-1.0 2620110.netcp eth0: Link is Up - 1Gbps/Full - flow control off Sending discover... [ 22.250972] netcp-1.0 2620110.netcp eth0: Link is Up - 1Gbps/Full - flow control off [ 22.258721] IPv6: ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready [ 22.265458] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:616 [ 22.273896] in_atomic(): 1, irqs_disabled(): 0, pid: 342, name: kworker/1:1 [ 22.280854] INFO: lockdep is turned off. [ 22.284767] Preemption disabled at:[< (null)>] (null) [ 22.290074] [ 22.291568] CPU: 1 PID: 342 Comm: kworker/1:1 Tainted: G W 4.1.6-01265-gfb1e101 #1 [ 22.300255] Hardware name: Keystone [ 22.303750] Workqueue: ipv6_addrconf addrconf_dad_work [ 22.308895] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 22.316643] [] (show_stack) from [] (dump_stack+0x84/0xc4) [ 22.323867] [] (dump_stack) from [] (mutex_lock_nested+0x28/0x4a8) [ 22.331786] [] (mutex_lock_nested) from [] (netcp_set_rx_mode+0x160/0x210) [ 22.340394] [] (netcp_set_rx_mode) from [] (__dev_mc_add+0x54/0x68) [ 22.348401] [] (__dev_mc_add) from [] (igmp6_group_added+0x168/0x1b4) [ 22.356580] [] (igmp6_group_added) from [] (ipv6_dev_mc_inc+0x4f0/0x5a8) [ 22.365019] [] (ipv6_dev_mc_inc) from [] (addrconf_dad_work+0x21c/0x33c) [ 22.373460] [] (addrconf_dad_work) from [] (process_one_work+0x214/0x8d0) [ 22.381986] [] (process_one_work) from [] (worker_thread+0x48/0x4bc) [ 22.390071] [] (worker_thread) from [] (kthread+0xf0/0x108) [ 22.397381] [] (kthread) from [] Trace related to incorrect usage of mutex inside ndo_set_rx_mode [ 24.086066] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:616 [ 24.094506] in_atomic(): 1, irqs_disabled(): 0, pid: 1682, name: ifconfig [ 24.101291] INFO: lockdep is turned off. [ 24.105203] Preemption disabled at:[< (null)>] (null) [ 24.110511] [ 24.112005] CPU: 2 PID: 1682 Comm: ifconfig Tainted: G W 4.1.6-01265-gfb1e101 #1 [ 24.120518] Hardware name: Keystone [ 24.124018] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 24.131772] [] (show_stack) from [] (dump_stack+0x84/0xc4) [ 24.138989] [] (dump_stack) from [] (mutex_lock_nested+0x28/0x4a8) [ 24.146908] [] (mutex_lock_nested) from [] (netcp_set_rx_mode+0x160/0x210) [ 24.155523] [] (netcp_set_rx_mode) from [] (dev_set_rx_mode+0x1c/0x28) [ 24.163787] [] (dev_set_rx_mode) from [] (__dev_open+0xc4/0x10c) [ 24.171531] [] (__dev_open) from [] (__dev_change_flags+0x94/0x144) [ 24.179528] [] (__dev_change_flags) from [] (dev_change_flags+0x18/0x48) [ 24.187966] [] (dev_change_flags) from [] (devinet_ioctl+0x6dc/0x7e4) [ 24.196145] [] (devinet_ioctl) from [] (sock_ioctl+0x1d0/0x2a8) [ 24.203803] [] (sock_ioctl) from [] (do_vfs_ioctl+0x41c/0x688) [ 24.211373] [] (do_vfs_ioctl) from [] (SyS_ioctl+0x34/0x5c) [ 24.218676] [] (SyS_ioctl) from [] (ret_fast_syscall+0x0/0x54) [ 24.227156] IPv6: ADDRCONF(NETDEV_UP): eth1: link is not ready Signed-off-by: Murali Karicheri Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/netcp_core.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index 8026daaf940b..9f9832f0dea9 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -367,7 +367,6 @@ int netcp_register_module(struct netcp_module *module) if (ret < 0) goto fail; } - mutex_unlock(&netcp_modules_lock); return 0; @@ -1395,7 +1394,6 @@ static void netcp_addr_sweep_del(struct netcp_intf *netcp) continue; dev_dbg(netcp->ndev_dev, "deleting address %pM, type %x\n", naddr->addr, naddr->type); - mutex_lock(&netcp_modules_lock); for_each_module(netcp, priv) { module = priv->netcp_module; if (!module->del_addr) @@ -1404,7 +1402,6 @@ static void netcp_addr_sweep_del(struct netcp_intf *netcp) naddr); WARN_ON(error); } - mutex_unlock(&netcp_modules_lock); netcp_addr_del(netcp, naddr); } } @@ -1421,7 +1418,7 @@ static void netcp_addr_sweep_add(struct netcp_intf *netcp) continue; dev_dbg(netcp->ndev_dev, "adding address %pM, type %x\n", naddr->addr, naddr->type); - mutex_lock(&netcp_modules_lock); + for_each_module(netcp, priv) { module = priv->netcp_module; if (!module->add_addr) @@ -1429,7 +1426,6 @@ static void netcp_addr_sweep_add(struct netcp_intf *netcp) error = module->add_addr(priv->module_priv, naddr); WARN_ON(error); } - mutex_unlock(&netcp_modules_lock); } } @@ -1443,6 +1439,7 @@ static void netcp_set_rx_mode(struct net_device *ndev) ndev->flags & IFF_ALLMULTI || netdev_mc_count(ndev) > NETCP_MAX_MCAST_ADDR); + spin_lock(&netcp->lock); /* first clear all marks */ netcp_addr_clear_mark(netcp); @@ -1461,6 +1458,7 @@ static void netcp_set_rx_mode(struct net_device *ndev) /* finally sweep and callout into modules */ netcp_addr_sweep_del(netcp); netcp_addr_sweep_add(netcp); + spin_unlock(&netcp->lock); } static void netcp_free_navigator_resources(struct netcp_intf *netcp) @@ -1625,7 +1623,6 @@ static int netcp_ndo_open(struct net_device *ndev) goto fail; } - mutex_lock(&netcp_modules_lock); for_each_module(netcp, intf_modpriv) { module = intf_modpriv->netcp_module; if (module->open) { @@ -1636,7 +1633,6 @@ static int netcp_ndo_open(struct net_device *ndev) } } } - mutex_unlock(&netcp_modules_lock); napi_enable(&netcp->rx_napi); napi_enable(&netcp->tx_napi); @@ -1653,7 +1649,6 @@ fail_open: if (module->close) module->close(intf_modpriv->module_priv, ndev); } - mutex_unlock(&netcp_modules_lock); fail: netcp_free_navigator_resources(netcp); @@ -1677,7 +1672,6 @@ static int netcp_ndo_stop(struct net_device *ndev) napi_disable(&netcp->rx_napi); napi_disable(&netcp->tx_napi); - mutex_lock(&netcp_modules_lock); for_each_module(netcp, intf_modpriv) { module = intf_modpriv->netcp_module; if (module->close) { @@ -1686,7 +1680,6 @@ static int netcp_ndo_stop(struct net_device *ndev) dev_err(netcp->ndev_dev, "Close failed\n"); } } - mutex_unlock(&netcp_modules_lock); /* Recycle Rx descriptors from completion queue */ netcp_empty_rx_queue(netcp); @@ -1714,7 +1707,6 @@ static int netcp_ndo_ioctl(struct net_device *ndev, if (!netif_running(ndev)) return -EINVAL; - mutex_lock(&netcp_modules_lock); for_each_module(netcp, intf_modpriv) { module = intf_modpriv->netcp_module; if (!module->ioctl) @@ -1730,7 +1722,6 @@ static int netcp_ndo_ioctl(struct net_device *ndev, } out: - mutex_unlock(&netcp_modules_lock); return (ret == 0) ? 0 : err; } @@ -1765,11 +1756,12 @@ static int netcp_rx_add_vid(struct net_device *ndev, __be16 proto, u16 vid) struct netcp_intf *netcp = netdev_priv(ndev); struct netcp_intf_modpriv *intf_modpriv; struct netcp_module *module; + unsigned long flags; int err = 0; dev_dbg(netcp->ndev_dev, "adding rx vlan id: %d\n", vid); - mutex_lock(&netcp_modules_lock); + spin_lock_irqsave(&netcp->lock, flags); for_each_module(netcp, intf_modpriv) { module = intf_modpriv->netcp_module; if ((module->add_vid) && (vid != 0)) { @@ -1781,7 +1773,8 @@ static int netcp_rx_add_vid(struct net_device *ndev, __be16 proto, u16 vid) } } } - mutex_unlock(&netcp_modules_lock); + spin_unlock_irqrestore(&netcp->lock, flags); + return err; } @@ -1790,11 +1783,12 @@ static int netcp_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vid) struct netcp_intf *netcp = netdev_priv(ndev); struct netcp_intf_modpriv *intf_modpriv; struct netcp_module *module; + unsigned long flags; int err = 0; dev_dbg(netcp->ndev_dev, "removing rx vlan id: %d\n", vid); - mutex_lock(&netcp_modules_lock); + spin_lock_irqsave(&netcp->lock, flags); for_each_module(netcp, intf_modpriv) { module = intf_modpriv->netcp_module; if (module->del_vid) { @@ -1806,7 +1800,7 @@ static int netcp_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vid) } } } - mutex_unlock(&netcp_modules_lock); + spin_unlock_irqrestore(&netcp->lock, flags); return err; } From aaa0062ecf4877a26dea66bee1039c6eaf906c94 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 23 Sep 2015 09:43:41 +0100 Subject: [PATCH 086/116] 8139cp: Do not re-enable RX interrupts in cp_tx_timeout() If an RX interrupt was already received but NAPI has not yet run when the RX timeout happens, we end up in cp_tx_timeout() with RX interrupts already disabled. Blindly re-enabling them will cause an IRQ storm. (This is made particularly horrid by the fact that cp_interrupt() always returns that it's handled the interrupt, even when it hasn't actually done anything. If it didn't do that, the core IRQ code would have detected the storm and handled it, I'd have had a clear smoking gun backtrace instead of just a spontaneously resetting router, and I'd have at *least* two days of my life back. Changing the return value of cp_interrupt() will be argued about under separate cover.) Unconditionally leave RX interrupts disabled after the reset, and schedule NAPI to check the receive ring and re-enable them. Signed-off-by: David Woodhouse Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/8139cp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index ba3dab721806..947932d4b79e 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -1262,9 +1262,10 @@ static void cp_tx_timeout(struct net_device *dev) rc = cp_init_rings(cp); cp_start_hw(cp); __cp_set_rx_mode(dev); - cp_enable_irq(cp); + cpw16_f(IntrMask, cp_norx_intr_mask); netif_wake_queue(dev); + napi_schedule_irqoff(&cp->napi); spin_unlock_irqrestore(&cp->lock, flags); } From 26b0bad6ac3a0167792dc4ffb276c29bc597d239 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 23 Sep 2015 09:44:06 +0100 Subject: [PATCH 087/116] 8139cp: Fix tx_queued debug message to print correct slot numbers After a certain amount of staring at the debug output of this driver, I realised it was lying to me. Signed-off-by: David Woodhouse Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/8139cp.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index 947932d4b79e..cbca0de1e857 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -786,7 +786,8 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb, wmb(); cp->tx_skb[entry] = skb; - entry = NEXT_TX(entry); + netif_dbg(cp, tx_queued, cp->dev, "tx queued, slot %d, skblen %d\n", + entry, skb->len); } else { struct cp_desc *txd; u32 first_len, first_eor; @@ -805,7 +806,6 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb, goto out_dma_error; cp->tx_skb[entry] = skb; - entry = NEXT_TX(entry); for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) { const skb_frag_t *this_frag = &skb_shinfo(skb)->frags[frag]; @@ -813,6 +813,8 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb, u32 ctrl; dma_addr_t mapping; + entry = NEXT_TX(entry); + len = skb_frag_size(this_frag); mapping = dma_map_single(&cp->pdev->dev, skb_frag_address(this_frag), @@ -848,9 +850,7 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb, txd->opts1 = cpu_to_le32(ctrl); wmb(); - cp->tx_skb[entry] = skb; - entry = NEXT_TX(entry); } txd = &cp->tx_ring[first_entry]; @@ -873,12 +873,13 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb, txd->opts1 = cpu_to_le32(first_eor | first_len | FirstFrag | DescOwn); wmb(); + + netif_dbg(cp, tx_queued, cp->dev, "tx queued, slots %d-%d, skblen %d\n", + first_entry, entry, skb->len); } - cp->tx_head = entry; + cp->tx_head = NEXT_TX(entry); netdev_sent_queue(dev, skb->len); - netif_dbg(cp, tx_queued, cp->dev, "tx queued, slot %d, skblen %d\n", - entry, skb->len); if (TX_BUFFS_AVAIL(cp) <= (MAX_SKB_FRAGS + 1)) netif_stop_queue(dev); From a3b804043f490aeec57d8ca5baccdd35e6250857 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 23 Sep 2015 09:44:38 +0100 Subject: [PATCH 088/116] 8139cp: Fix TSO/scatter-gather descriptor setup When sending a TSO frame in multiple buffers, we were neglecting to set the first descriptor up in TSO mode. Signed-off-by: David Woodhouse Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/8139cp.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index cbca0de1e857..75a8cee6b1d6 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -790,7 +790,7 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb, entry, skb->len); } else { struct cp_desc *txd; - u32 first_len, first_eor; + u32 first_len, first_eor, ctrl; dma_addr_t first_mapping; int frag, first_entry = entry; const struct iphdr *ip = ip_hdr(skb); @@ -810,7 +810,6 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb, for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) { const skb_frag_t *this_frag = &skb_shinfo(skb)->frags[frag]; u32 len; - u32 ctrl; dma_addr_t mapping; entry = NEXT_TX(entry); @@ -858,20 +857,19 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb, txd->addr = cpu_to_le64(first_mapping); wmb(); - if (skb->ip_summed == CHECKSUM_PARTIAL) { + ctrl = first_eor | first_len | FirstFrag | DescOwn; + if (mss) + ctrl |= LargeSend | ((mss & MSSMask) << MSSShift); + else if (skb->ip_summed == CHECKSUM_PARTIAL) { if (ip->protocol == IPPROTO_TCP) - txd->opts1 = cpu_to_le32(first_eor | first_len | - FirstFrag | DescOwn | - IPCS | TCPCS); + ctrl |= IPCS | TCPCS; else if (ip->protocol == IPPROTO_UDP) - txd->opts1 = cpu_to_le32(first_eor | first_len | - FirstFrag | DescOwn | - IPCS | UDPCS); + ctrl |= IPCS | UDPCS; else BUG(); - } else - txd->opts1 = cpu_to_le32(first_eor | first_len | - FirstFrag | DescOwn); + } + + txd->opts1 = cpu_to_le32(ctrl); wmb(); netif_dbg(cp, tx_queued, cp->dev, "tx queued, slots %d-%d, skblen %d\n", From 0a5aeee0b79fa99d8e04c98dd4e87d4f52aa497b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 23 Sep 2015 09:44:57 +0100 Subject: [PATCH 089/116] 8139cp: Reduce duplicate csum/tso code in cp_start_xmit() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We calculate the value of the opts1 descriptor field in three different places. With two different behaviours when given an invalid packet to be checksummed — none of them correct. Sort that out. Signed-off-by: David Woodhouse Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/8139cp.c | 61 +++++++++------------------ 1 file changed, 20 insertions(+), 41 deletions(-) diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index 75a8cee6b1d6..b3bd8b103593 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -733,7 +733,7 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb, { struct cp_private *cp = netdev_priv(dev); unsigned entry; - u32 eor, flags; + u32 eor, opts1; unsigned long intr_flags; __le32 opts2; int mss = 0; @@ -753,6 +753,21 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb, mss = skb_shinfo(skb)->gso_size; opts2 = cpu_to_le32(cp_tx_vlan_tag(skb)); + opts1 = DescOwn; + if (mss) + opts1 |= LargeSend | ((mss & MSSMask) << MSSShift); + else if (skb->ip_summed == CHECKSUM_PARTIAL) { + const struct iphdr *ip = ip_hdr(skb); + if (ip->protocol == IPPROTO_TCP) + opts1 |= IPCS | TCPCS; + else if (ip->protocol == IPPROTO_UDP) + opts1 |= IPCS | UDPCS; + else { + WARN_ONCE(1, + "Net bug: asked to checksum invalid Legacy IP packet\n"); + goto out_dma_error; + } + } if (skb_shinfo(skb)->nr_frags == 0) { struct cp_desc *txd = &cp->tx_ring[entry]; @@ -768,21 +783,9 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb, txd->addr = cpu_to_le64(mapping); wmb(); - flags = eor | len | DescOwn | FirstFrag | LastFrag; + opts1 |= eor | len | FirstFrag | LastFrag; - if (mss) - flags |= LargeSend | ((mss & MSSMask) << MSSShift); - else if (skb->ip_summed == CHECKSUM_PARTIAL) { - const struct iphdr *ip = ip_hdr(skb); - if (ip->protocol == IPPROTO_TCP) - flags |= IPCS | TCPCS; - else if (ip->protocol == IPPROTO_UDP) - flags |= IPCS | UDPCS; - else - WARN_ON(1); /* we need a WARN() */ - } - - txd->opts1 = cpu_to_le32(flags); + txd->opts1 = cpu_to_le32(opts1); wmb(); cp->tx_skb[entry] = skb; @@ -793,7 +796,6 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb, u32 first_len, first_eor, ctrl; dma_addr_t first_mapping; int frag, first_entry = entry; - const struct iphdr *ip = ip_hdr(skb); /* We must give this initial chunk to the device last. * Otherwise we could race with the device. @@ -825,19 +827,7 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb, eor = (entry == (CP_TX_RING_SIZE - 1)) ? RingEnd : 0; - ctrl = eor | len | DescOwn; - - if (mss) - ctrl |= LargeSend | - ((mss & MSSMask) << MSSShift); - else if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (ip->protocol == IPPROTO_TCP) - ctrl |= IPCS | TCPCS; - else if (ip->protocol == IPPROTO_UDP) - ctrl |= IPCS | UDPCS; - else - BUG(); - } + ctrl = opts1 | eor | len; if (frag == skb_shinfo(skb)->nr_frags - 1) ctrl |= LastFrag; @@ -857,18 +847,7 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb, txd->addr = cpu_to_le64(first_mapping); wmb(); - ctrl = first_eor | first_len | FirstFrag | DescOwn; - if (mss) - ctrl |= LargeSend | ((mss & MSSMask) << MSSShift); - else if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (ip->protocol == IPPROTO_TCP) - ctrl |= IPCS | TCPCS; - else if (ip->protocol == IPPROTO_UDP) - ctrl |= IPCS | UDPCS; - else - BUG(); - } - + ctrl = opts1 | first_eor | first_len | FirstFrag; txd->opts1 = cpu_to_le32(ctrl); wmb(); From 7f4c685633e2df9ba10d49a31dda13715745db37 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 23 Sep 2015 09:45:16 +0100 Subject: [PATCH 090/116] 8139cp: Fix DMA unmapping of transmitted buffers The low 16 bits of the 'opts1' field in the TX descriptor are supposed to still contain the buffer length when the descriptor is handed back to us. In practice, at least on my hardware, they don't. So stash the original value of the opts1 field and get the length to unmap from there. There are other ways we could have worked out the length, but I actually want a stash of the opts1 field anyway so that I can dump it alongside the contents of the descriptor ring when we suffer a TX timeout. Signed-off-by: David Woodhouse Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/8139cp.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index b3bd8b103593..ec6bd864b263 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -341,6 +341,7 @@ struct cp_private { unsigned tx_tail; struct cp_desc *tx_ring; struct sk_buff *tx_skb[CP_TX_RING_SIZE]; + u32 tx_opts[CP_TX_RING_SIZE]; unsigned rx_buf_sz; unsigned wol_enabled : 1; /* Is Wake-on-LAN enabled? */ @@ -665,7 +666,7 @@ static void cp_tx (struct cp_private *cp) BUG_ON(!skb); dma_unmap_single(&cp->pdev->dev, le64_to_cpu(txd->addr), - le32_to_cpu(txd->opts1) & 0xffff, + cp->tx_opts[tx_tail] & 0xffff, PCI_DMA_TODEVICE); if (status & LastFrag) { @@ -789,6 +790,7 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb, wmb(); cp->tx_skb[entry] = skb; + cp->tx_opts[entry] = opts1; netif_dbg(cp, tx_queued, cp->dev, "tx queued, slot %d, skblen %d\n", entry, skb->len); } else { @@ -839,6 +841,8 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb, txd->opts1 = cpu_to_le32(ctrl); wmb(); + + cp->tx_opts[entry] = ctrl; cp->tx_skb[entry] = skb; } @@ -851,6 +855,7 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb, txd->opts1 = cpu_to_le32(ctrl); wmb(); + cp->tx_opts[first_entry] = ctrl; netif_dbg(cp, tx_queued, cp->dev, "tx queued, slots %d-%d, skblen %d\n", first_entry, entry, skb->len); } @@ -1093,6 +1098,7 @@ static int cp_init_rings (struct cp_private *cp) { memset(cp->tx_ring, 0, sizeof(struct cp_desc) * CP_TX_RING_SIZE); cp->tx_ring[CP_TX_RING_SIZE - 1].opts1 = cpu_to_le32(RingEnd); + memset(cp->tx_opts, 0, sizeof(cp->tx_opts)); cp_init_rings_index(cp); @@ -1150,6 +1156,7 @@ static void cp_clean_rings (struct cp_private *cp) memset(cp->rx_ring, 0, sizeof(struct cp_desc) * CP_RX_RING_SIZE); memset(cp->tx_ring, 0, sizeof(struct cp_desc) * CP_TX_RING_SIZE); + memset(cp->tx_opts, 0, sizeof(cp->tx_opts)); memset(cp->rx_skb, 0, sizeof(struct sk_buff *) * CP_RX_RING_SIZE); memset(cp->tx_skb, 0, sizeof(struct sk_buff *) * CP_TX_RING_SIZE); From 41b976414c88016e2c9d9b2f6667ee67a998d388 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 23 Sep 2015 09:45:31 +0100 Subject: [PATCH 091/116] 8139cp: Dump contents of descriptor ring on TX timeout We are seeing unexplained TX timeouts under heavy load. Let's try to get a better idea of what's going on. Signed-off-by: David Woodhouse Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/8139cp.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index ec6bd864b263..686334f4588d 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -157,6 +157,7 @@ enum { NWayAdvert = 0x66, /* MII ADVERTISE */ NWayLPAR = 0x68, /* MII LPA */ NWayExpansion = 0x6A, /* MII Expansion */ + TxDmaOkLowDesc = 0x82, /* Low 16 bit address of a Tx descriptor. */ Config5 = 0xD8, /* Config5 */ TxPoll = 0xD9, /* Tell chip to check Tx descriptors for work */ RxMaxSize = 0xDA, /* Max size of an Rx packet (8169 only) */ @@ -1234,7 +1235,7 @@ static void cp_tx_timeout(struct net_device *dev) { struct cp_private *cp = netdev_priv(dev); unsigned long flags; - int rc; + int rc, i; netdev_warn(dev, "Transmit timeout, status %2x %4x %4x %4x\n", cpr8(Cmd), cpr16(CpCmd), @@ -1242,6 +1243,17 @@ static void cp_tx_timeout(struct net_device *dev) spin_lock_irqsave(&cp->lock, flags); + netif_dbg(cp, tx_err, cp->dev, "TX ring head %d tail %d desc %x\n", + cp->tx_head, cp->tx_tail, cpr16(TxDmaOkLowDesc)); + for (i = 0; i < CP_TX_RING_SIZE; i++) { + netif_dbg(cp, tx_err, cp->dev, + "TX slot %d @%p: %08x (%08x) %08x %llx %p\n", + i, &cp->tx_ring[i], le32_to_cpu(cp->tx_ring[i].opts1), + cp->tx_opts[i], le32_to_cpu(cp->tx_ring[i].opts2), + le64_to_cpu(cp->tx_ring[i].addr), + cp->tx_skb[i]); + } + cp_stop_hw(cp); cp_clean_rings(cp); rc = cp_init_rings(cp); From 7bbe33ff1896d225588b37c574c0d80dbc63c657 Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Tue, 22 Sep 2015 13:09:32 -0400 Subject: [PATCH 092/116] geneve: use network byte order for destination port config parameter This is primarily for consistancy with vxlan and other tunnels which use network byte order for similar parameters. Signed-off-by: John W. Linville Signed-off-by: David S. Miller --- drivers/net/geneve.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 4ce8b7b90c7c..8f5c02eed47d 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -815,7 +815,7 @@ static struct geneve_dev *geneve_find_dev(struct geneve_net *gn, static int geneve_configure(struct net *net, struct net_device *dev, __be32 rem_addr, __u32 vni, __u8 ttl, __u8 tos, - __u16 dst_port, bool metadata) + __be16 dst_port, bool metadata) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev *t, *geneve = netdev_priv(dev); @@ -840,10 +840,10 @@ static int geneve_configure(struct net *net, struct net_device *dev, geneve->ttl = ttl; geneve->tos = tos; - geneve->dst_port = htons(dst_port); + geneve->dst_port = dst_port; geneve->collect_md = metadata; - t = geneve_find_dev(gn, htons(dst_port), rem_addr, geneve->vni, + t = geneve_find_dev(gn, dst_port, rem_addr, geneve->vni, &tun_on_same_port, &tun_collect_md); if (t) return -EBUSY; @@ -867,7 +867,7 @@ static int geneve_configure(struct net *net, struct net_device *dev, static int geneve_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { - __u16 dst_port = GENEVE_UDP_PORT; + __be16 dst_port = htons(GENEVE_UDP_PORT); __u8 ttl = 0, tos = 0; bool metadata = false; __be32 rem_addr; @@ -886,7 +886,7 @@ static int geneve_newlink(struct net *net, struct net_device *dev, tos = nla_get_u8(data[IFLA_GENEVE_TOS]); if (data[IFLA_GENEVE_PORT]) - dst_port = nla_get_u16(data[IFLA_GENEVE_PORT]); + dst_port = nla_get_be16(data[IFLA_GENEVE_PORT]); if (data[IFLA_GENEVE_COLLECT_METADATA]) metadata = true; @@ -909,7 +909,7 @@ static size_t geneve_get_size(const struct net_device *dev) nla_total_size(sizeof(struct in_addr)) + /* IFLA_GENEVE_REMOTE */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TOS */ - nla_total_size(sizeof(__u16)) + /* IFLA_GENEVE_PORT */ + nla_total_size(sizeof(__be16)) + /* IFLA_GENEVE_PORT */ nla_total_size(0) + /* IFLA_GENEVE_COLLECT_METADATA */ 0; } @@ -931,7 +931,7 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos)) goto nla_put_failure; - if (nla_put_u16(skb, IFLA_GENEVE_PORT, ntohs(geneve->dst_port))) + if (nla_put_be16(skb, IFLA_GENEVE_PORT, geneve->dst_port)) goto nla_put_failure; if (geneve->collect_md) { @@ -971,7 +971,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name, if (IS_ERR(dev)) return dev; - err = geneve_configure(net, dev, 0, 0, 0, 0, dst_port, true); + err = geneve_configure(net, dev, 0, 0, 0, 0, htons(dst_port), true); if (err) { free_netdev(dev); return ERR_PTR(err); From da314c9923fed553a007785a901fd395b7eb6c19 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 22 Sep 2015 11:38:56 +0800 Subject: [PATCH 093/116] netlink: Replace rhash_portid with bound On Mon, Sep 21, 2015 at 02:20:22PM -0400, Tejun Heo wrote: > > store_release and load_acquire are different from the usual memory > barriers and can't be paired this way. You have to pair store_release > and load_acquire. Besides, it isn't a particularly good idea to OK I've decided to drop the acquire/release helpers as they don't help us at all and simply pessimises the code by using full memory barriers (on some architectures) where only a write or read barrier is needed. > depend on memory barriers embedded in other data structures like the > above. Here, especially, rhashtable_insert() would have write barrier > *before* the entry is hashed not necessarily *after*, which means that > in the above case, a socket which appears to have set bound to a > reader might not visible when the reader tries to look up the socket > on the hashtable. But you are right we do need an explicit write barrier here to ensure that the hashing is visible. > There's no reason to be overly smart here. This isn't a crazy hot > path, write barriers tend to be very cheap, store_release more so. > Please just do smp_store_release() and note what it's paired with. It's not about being overly smart. It's about actually understanding what's going on with the code. I've seen too many instances of people simply sprinkling synchronisation primitives around without any knowledge of what is happening underneath, which is just a recipe for creating hard-to-debug races. > > @@ -1539,7 +1546,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, > > } > > } > > > > - if (!nlk->portid) { > > + if (!nlk->bound) { > > I don't think you can skip load_acquire here just because this is the > second deref of the variable. That doesn't change anything. Race > condition could still happen between the first and second tests and > skipping the second would lead to the same kind of bug. The reason this one is OK is because we do not use nlk->portid or try to get nlk from the hash table before we return to user-space. However, there is a real bug here that none of these acquire/release helpers discovered. The two bound tests here used to be a single one. Now that they are separate it is entirely possible for another thread to come in the middle and bind the socket. So we need to repeat the portid check in order to maintain consistency. > > @@ -1587,7 +1594,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, > > !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) > > return -EPERM; > > > > - if (!nlk->portid) > > + if (!nlk->bound) > > Don't we need load_acquire here too? Is this path holding a lock > which makes that unnecessary? Ditto. ---8<--- The commit 1f770c0a09da855a2b51af6d19de97fb955eca85 ("netlink: Fix autobind race condition that leads to zero port ID") created some new races that can occur due to inconcsistencies between the two port IDs. Tejun is right that a barrier is unavoidable. Therefore I am reverting to the original patch that used a boolean to indicate that a user netlink socket has been bound. Barriers have been added where necessary to ensure that a valid portid and the hashed socket is visible. I have also changed netlink_insert to only return EBUSY if the socket is bound to a portid different to the requested one. This combined with only reading nlk->bound once in netlink_bind fixes a race where two threads that bind the socket at the same time with different port IDs may both succeed. Fixes: 1f770c0a09da ("netlink: Fix autobind race condition that leads to zero port ID") Reported-by: Tejun Heo Reported-by: Linus Torvalds Signed-off-by: Herbert Xu Nacked-by: Tejun Heo Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 39 ++++++++++++++++++++++++++++----------- net/netlink/af_netlink.h | 2 +- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 9f51608b968a..8f060d7f9a0e 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1031,7 +1031,7 @@ static inline int netlink_compare(struct rhashtable_compare_arg *arg, const struct netlink_compare_arg *x = arg->key; const struct netlink_sock *nlk = ptr; - return nlk->rhash_portid != x->portid || + return nlk->portid != x->portid || !net_eq(sock_net(&nlk->sk), read_pnet(&x->pnet)); } @@ -1057,7 +1057,7 @@ static int __netlink_insert(struct netlink_table *table, struct sock *sk) { struct netlink_compare_arg arg; - netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->rhash_portid); + netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid); return rhashtable_lookup_insert_key(&table->hash, &arg, &nlk_sk(sk)->node, netlink_rhashtable_params); @@ -1110,8 +1110,8 @@ static int netlink_insert(struct sock *sk, u32 portid) lock_sock(sk); - err = -EBUSY; - if (nlk_sk(sk)->portid) + err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY; + if (nlk_sk(sk)->bound) goto err; err = -ENOMEM; @@ -1119,7 +1119,7 @@ static int netlink_insert(struct sock *sk, u32 portid) unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX)) goto err; - nlk_sk(sk)->rhash_portid = portid; + nlk_sk(sk)->portid = portid; sock_hold(sk); err = __netlink_insert(table, sk); @@ -1135,7 +1135,9 @@ static int netlink_insert(struct sock *sk, u32 portid) goto err; } - nlk_sk(sk)->portid = portid; + /* We need to ensure that the socket is hashed and visible. */ + smp_wmb(); + nlk_sk(sk)->bound = portid; err: release_sock(sk); @@ -1521,6 +1523,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err; long unsigned int groups = nladdr->nl_groups; + bool bound; if (addr_len < sizeof(struct sockaddr_nl)) return -EINVAL; @@ -1537,9 +1540,14 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, return err; } - if (nlk->portid) + bound = nlk->bound; + if (bound) { + /* Ensure nlk->portid is up-to-date. */ + smp_rmb(); + if (nladdr->nl_pid != nlk->portid) return -EINVAL; + } if (nlk->netlink_bind && groups) { int group; @@ -1555,7 +1563,10 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, } } - if (!nlk->portid) { + /* No need for barriers here as we return to user-space without + * using any of the bound attributes. + */ + if (!bound) { err = nladdr->nl_pid ? netlink_insert(sk, nladdr->nl_pid) : netlink_autobind(sock); @@ -1603,7 +1614,10 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; - if (!nlk->portid) + /* No need for barriers here as we return to user-space without + * using any of the bound attributes. + */ + if (!nlk->bound) err = netlink_autobind(sock); if (err == 0) { @@ -2444,10 +2458,13 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) dst_group = nlk->dst_group; } - if (!nlk->portid) { + if (!nlk->bound) { err = netlink_autobind(sock); if (err) goto out; + } else { + /* Ensure nlk is hashed and visible. */ + smp_rmb(); } /* It's a really convoluted way for userland to ask for mmaped @@ -3273,7 +3290,7 @@ static inline u32 netlink_hash(const void *data, u32 len, u32 seed) const struct netlink_sock *nlk = data; struct netlink_compare_arg arg; - netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->rhash_portid); + netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid); return jhash2((u32 *)&arg, netlink_compare_arg_len / sizeof(u32), seed); } diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 80b2b7526dfd..14437d9b1965 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -25,7 +25,6 @@ struct netlink_ring { struct netlink_sock { /* struct sock has to be the first member of netlink_sock */ struct sock sk; - u32 rhash_portid; u32 portid; u32 dst_portid; u32 dst_group; @@ -36,6 +35,7 @@ struct netlink_sock { unsigned long state; size_t max_recvmsg_len; wait_queue_head_t wait; + bool bound; bool cb_running; struct netlink_callback cb; struct mutex *cb_mutex; From 6ae459bdaaeebc632b16e54dcbabb490c6931d61 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 22 Sep 2015 12:57:53 -0700 Subject: [PATCH 094/116] skbuff: Fix skb checksum flag on skb pull VXLAN device can receive skb with checksum partial. But the checksum offset could be in outer header which is pulled on receive. This results in negative checksum offset for the skb. Such skb can cause the assert failure in skb_checksum_help(). Following patch fixes the bug by setting checksum-none while pulling outer header. Following is the kernel panic msg from old kernel hitting the bug. ------------[ cut here ]------------ kernel BUG at net/core/dev.c:1906! RIP: 0010:[] skb_checksum_help+0x144/0x150 Call Trace: [] queue_userspace_packet+0x408/0x470 [openvswitch] [] ovs_dp_upcall+0x5d/0x60 [openvswitch] [] ovs_dp_process_packet_with_key+0xe6/0x100 [openvswitch] [] ovs_dp_process_received_packet+0x4b/0x80 [openvswitch] [] ovs_vport_receive+0x2a/0x30 [openvswitch] [] vxlan_rcv+0x53/0x60 [openvswitch] [] vxlan_udp_encap_recv+0x8b/0xf0 [openvswitch] [] udp_queue_rcv_skb+0x2dc/0x3b0 [] __udp4_lib_rcv+0x1cf/0x6c0 [] udp_rcv+0x1a/0x20 [] ip_local_deliver_finish+0xdd/0x280 [] ip_local_deliver+0x88/0x90 [] ip_rcv_finish+0x10d/0x370 [] ip_rcv+0x235/0x300 [] __netif_receive_skb+0x55d/0x620 [] netif_receive_skb+0x80/0x90 [] virtnet_poll+0x555/0x6f0 [] net_rx_action+0x134/0x290 [] __do_softirq+0xa8/0x210 [] call_softirq+0x1c/0x30 [] do_softirq+0x65/0xa0 [] irq_exit+0x8e/0xb0 [] do_IRQ+0x63/0xe0 [] common_interrupt+0x6e/0x6e Reported-by: Anupam Chanda Signed-off-by: Pravin B Shelar Acked-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9987af080fa0..2b0a30a6e31c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2707,6 +2707,9 @@ static inline void skb_postpull_rcsum(struct sk_buff *skb, { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_sub(skb->csum, csum_partial(start, len, 0)); + else if (skb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_start_offset(skb) <= len) + skb->ip_summed = CHECKSUM_NONE; } unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len); From d5b8d6404395641987db76e28334cae4cef771ae Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 21 Sep 2015 16:47:09 +0100 Subject: [PATCH 095/116] net: gianfar: remove misuse of IRQF_NO_SUSPEND flag The device is set as wakeup capable using proper wakeup API but the driver misuses IRQF_NO_SUSPEND to set the interrupt as wakeup source which is incorrect. This patch removes the use of IRQF_NO_SUSPEND flags replacing it with enable_irq_wake instead. Cc: "David S. Miller" Cc: Claudiu Manoil Cc: Kevin Hao Cc: netdev@vger.kernel.org Signed-off-by: Sudeep Holla Acked-by: Claudiu Manoil Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/gianfar.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 4b69d061d90f..803ed4c93503 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -1970,8 +1970,7 @@ static int register_grp_irqs(struct gfar_priv_grp *grp) /* Install our interrupt handlers for Error, * Transmit, and Receive */ - err = request_irq(gfar_irq(grp, ER)->irq, gfar_error, - IRQF_NO_SUSPEND, + err = request_irq(gfar_irq(grp, ER)->irq, gfar_error, 0, gfar_irq(grp, ER)->name, grp); if (err < 0) { netif_err(priv, intr, dev, "Can't get IRQ %d\n", @@ -1979,6 +1978,8 @@ static int register_grp_irqs(struct gfar_priv_grp *grp) goto err_irq_fail; } + enable_irq_wake(gfar_irq(grp, ER)->irq); + err = request_irq(gfar_irq(grp, TX)->irq, gfar_transmit, 0, gfar_irq(grp, TX)->name, grp); if (err < 0) { @@ -1994,14 +1995,14 @@ static int register_grp_irqs(struct gfar_priv_grp *grp) goto rx_irq_fail; } } else { - err = request_irq(gfar_irq(grp, TX)->irq, gfar_interrupt, - IRQF_NO_SUSPEND, + err = request_irq(gfar_irq(grp, TX)->irq, gfar_interrupt, 0, gfar_irq(grp, TX)->name, grp); if (err < 0) { netif_err(priv, intr, dev, "Can't get IRQ %d\n", gfar_irq(grp, TX)->irq); goto err_irq_fail; } + enable_irq_wake(gfar_irq(grp, TX)->irq); } return 0; From 63d008a4e9ee86614ca5671b7f3ba447df007190 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 22 Sep 2015 18:12:11 +0200 Subject: [PATCH 096/116] ipv4: send arp replies to the correct tunnel When using ip lwtunnels, the additional data for xmit (basically, the actual tunnel to use) are carried in ip_tunnel_info either in dst->lwtstate or in metadata dst. When replying to ARP requests, we need to send the reply to the same tunnel the request came from. This means we need to construct proper metadata dst for ARP replies. We could perform another route lookup to get a dst entry with the correct lwtstate. However, this won't always ensure that the outgoing tunnel is the same as the incoming one, and it won't work anyway for IPv4 duplicate address detection. The only thing to do is to "reverse" the ip_tunnel_info. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 2 ++ net/ipv4/arp.c | 39 +++++++++++++++++++++++++-------------- net/ipv4/ip_tunnel_core.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 14 deletions(-) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 9a6a3ba888e8..f6dafec9102c 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -276,6 +276,8 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, u8 tos, u8 ttl, __be16 df, bool xnet); +struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, + gfp_t flags); struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool gre_csum, int gso_type_mask); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 30409b75e925..f03db8b7abee 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -113,6 +113,8 @@ #include #include #include +#include +#include #include @@ -296,7 +298,8 @@ static void arp_send_dst(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, - const unsigned char *target_hw, struct sk_buff *oskb) + const unsigned char *target_hw, + struct dst_entry *dst) { struct sk_buff *skb; @@ -309,9 +312,7 @@ static void arp_send_dst(int type, int ptype, __be32 dest_ip, if (!skb) return; - if (oskb) - skb_dst_copy(skb, oskb); - + skb_dst_set(skb, dst); arp_xmit(skb); } @@ -333,6 +334,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); struct in_device *in_dev; + struct dst_entry *dst = NULL; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -381,9 +383,10 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) } } + if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE)) + dst = dst_clone(skb_dst(skb)); arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, - dst_hw, dev->dev_addr, NULL, - dev->priv_flags & IFF_XMIT_DST_RELEASE ? NULL : skb); + dst_hw, dev->dev_addr, NULL, dst); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) @@ -649,6 +652,7 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) int addr_type; struct neighbour *n; struct net *net = dev_net(dev); + struct dst_entry *reply_dst = NULL; bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device @@ -749,13 +753,18 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) * cache. */ + if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb)) + reply_dst = (struct dst_entry *) + iptunnel_metadata_reply(skb_metadata_dst(skb), + GFP_ATOMIC); + /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, - dev->dev_addr, sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, + sha, dev->dev_addr, sha, reply_dst); goto out; } @@ -774,9 +783,10 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, - dev, tip, sha, dev->dev_addr, - sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, + sip, dev, tip, sha, + dev->dev_addr, sha, + reply_dst); neigh_release(n); } } @@ -794,9 +804,10 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, - dev, tip, sha, dev->dev_addr, - sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, + sip, dev, tip, sha, + dev->dev_addr, sha, + reply_dst); } else { pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 9b97204b8c81..ce3a1e728606 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -46,6 +46,7 @@ #include #include #include +#include int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, @@ -119,6 +120,33 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) } EXPORT_SYMBOL_GPL(iptunnel_pull_header); +struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, + gfp_t flags) +{ + struct metadata_dst *res; + struct ip_tunnel_info *dst, *src; + + if (!md || md->u.tun_info.mode & IP_TUNNEL_INFO_TX) + return NULL; + + res = metadata_dst_alloc(0, flags); + if (!res) + return NULL; + + dst = &res->u.tun_info; + src = &md->u.tun_info; + dst->key.tun_id = src->key.tun_id; + if (src->mode & IP_TUNNEL_INFO_IPV6) + memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src, + sizeof(struct in6_addr)); + else + dst->key.u.ipv4.dst = src->key.u.ipv4.src; + dst->mode = src->mode | IP_TUNNEL_INFO_TX; + + return res; +} +EXPORT_SYMBOL_GPL(iptunnel_metadata_reply); + struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool csum_help, int gso_type_mask) From b194f30c61efb0767a98f47a64530baa8b731670 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 22 Sep 2015 18:12:12 +0200 Subject: [PATCH 097/116] lwtunnel: remove source and destination UDP port config option The UDP tunnel config is asymmetric wrt. to the ports used. The source and destination ports from one direction of the tunnel are not related to the ports of the other direction. We need to be able to respond to ARP requests using the correct ports without involving routing. As the consequence, UDP ports need to be fixed property of the tunnel interface and cannot be set per route. Remove the ability to set ports per route. This is still okay to do, as no kernel has been released with these attributes yet. Note that the ability to specify source and destination ports is preserved for other users of the lwtunnel API which don't use routes for tunnel key specification (like openvswitch). If in the future we rework ARP handling to allow port specification, the attributes can be added back. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/uapi/linux/lwtunnel.h | 4 ---- net/ipv4/ip_tunnel_core.c | 24 ------------------------ 2 files changed, 28 deletions(-) diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 34141a5dfe74..f8b01887a495 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -21,8 +21,6 @@ enum lwtunnel_ip_t { LWTUNNEL_IP_SRC, LWTUNNEL_IP_TTL, LWTUNNEL_IP_TOS, - LWTUNNEL_IP_SPORT, - LWTUNNEL_IP_DPORT, LWTUNNEL_IP_FLAGS, __LWTUNNEL_IP_MAX, }; @@ -36,8 +34,6 @@ enum lwtunnel_ip6_t { LWTUNNEL_IP6_SRC, LWTUNNEL_IP6_HOPLIMIT, LWTUNNEL_IP6_TC, - LWTUNNEL_IP6_SPORT, - LWTUNNEL_IP6_DPORT, LWTUNNEL_IP6_FLAGS, __LWTUNNEL_IP6_MAX, }; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index ce3a1e728606..84dce6a92f93 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -226,8 +226,6 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { [LWTUNNEL_IP_SRC] = { .type = NLA_U32 }, [LWTUNNEL_IP_TTL] = { .type = NLA_U8 }, [LWTUNNEL_IP_TOS] = { .type = NLA_U8 }, - [LWTUNNEL_IP_SPORT] = { .type = NLA_U16 }, - [LWTUNNEL_IP_DPORT] = { .type = NLA_U16 }, [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, }; @@ -267,12 +265,6 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, if (tb[LWTUNNEL_IP_TOS]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); - if (tb[LWTUNNEL_IP_SPORT]) - tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP_SPORT]); - - if (tb[LWTUNNEL_IP_DPORT]) - tun_info->key.tp_dst = nla_get_be16(tb[LWTUNNEL_IP_DPORT]); - if (tb[LWTUNNEL_IP_FLAGS]) tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP_FLAGS]); @@ -294,8 +286,6 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || - nla_put_u16(skb, LWTUNNEL_IP_SPORT, tun_info->key.tp_src) || - nla_put_u16(skb, LWTUNNEL_IP_DPORT, tun_info->key.tp_dst) || nla_put_u16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) return -ENOMEM; @@ -309,8 +299,6 @@ static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(4) /* LWTUNNEL_IP_SRC */ + nla_total_size(1) /* LWTUNNEL_IP_TOS */ + nla_total_size(1) /* LWTUNNEL_IP_TTL */ - + nla_total_size(2) /* LWTUNNEL_IP_SPORT */ - + nla_total_size(2) /* LWTUNNEL_IP_DPORT */ + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */ } @@ -333,8 +321,6 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 }, [LWTUNNEL_IP6_TC] = { .type = NLA_U8 }, - [LWTUNNEL_IP6_SPORT] = { .type = NLA_U16 }, - [LWTUNNEL_IP6_DPORT] = { .type = NLA_U16 }, [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, }; @@ -374,12 +360,6 @@ static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, if (tb[LWTUNNEL_IP6_TC]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); - if (tb[LWTUNNEL_IP6_SPORT]) - tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP6_SPORT]); - - if (tb[LWTUNNEL_IP6_DPORT]) - tun_info->key.tp_dst = nla_get_be16(tb[LWTUNNEL_IP6_DPORT]); - if (tb[LWTUNNEL_IP6_FLAGS]) tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]); @@ -401,8 +381,6 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.ttl) || - nla_put_u16(skb, LWTUNNEL_IP6_SPORT, tun_info->key.tp_src) || - nla_put_u16(skb, LWTUNNEL_IP6_DPORT, tun_info->key.tp_dst) || nla_put_u16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) return -ENOMEM; @@ -416,8 +394,6 @@ static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(16) /* LWTUNNEL_IP6_SRC */ + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ + nla_total_size(1) /* LWTUNNEL_IP6_TC */ - + nla_total_size(2) /* LWTUNNEL_IP6_SPORT */ - + nla_total_size(2) /* LWTUNNEL_IP6_DPORT */ + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */ } From d8aecb10115497f6cdf841df8c88ebb3ba25fa28 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 22 Sep 2015 17:01:11 -0700 Subject: [PATCH 098/116] net: revert "net_sched: move tp->root allocation into fw_init()" fw filter uses tp->root==NULL to check if it is the old method, so it doesn't need allocation at all in this case. This patch reverts the offending commit and adds some comments for old method to make it obvious. Fixes: 33f8b9ecdb15 ("net_sched: move tp->root allocation into fw_init()") Reported-by: Akshat Kakkar Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/cls_fw.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 715e01e5910a..f23a3b68bba6 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -33,7 +33,6 @@ struct fw_head { u32 mask; - bool mask_set; struct fw_filter __rcu *ht[HTSIZE]; struct rcu_head rcu; }; @@ -84,7 +83,7 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, } } } else { - /* old method */ + /* Old method: classify the packet using its skb mark. */ if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id ^ tp->q->handle)))) { res->classid = id; @@ -114,14 +113,9 @@ static unsigned long fw_get(struct tcf_proto *tp, u32 handle) static int fw_init(struct tcf_proto *tp) { - struct fw_head *head; - - head = kzalloc(sizeof(struct fw_head), GFP_KERNEL); - if (head == NULL) - return -ENOBUFS; - - head->mask_set = false; - rcu_assign_pointer(tp->root, head); + /* We don't allocate fw_head here, because in the old method + * we don't need it at all. + */ return 0; } @@ -252,7 +246,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, int err; if (!opt) - return handle ? -EINVAL : 0; + return handle ? -EINVAL : 0; /* Succeed if it is old method. */ err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy); if (err < 0) @@ -302,11 +296,17 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, if (!handle) return -EINVAL; - if (!head->mask_set) { - head->mask = 0xFFFFFFFF; + if (!head) { + u32 mask = 0xFFFFFFFF; if (tb[TCA_FW_MASK]) - head->mask = nla_get_u32(tb[TCA_FW_MASK]); - head->mask_set = true; + mask = nla_get_u32(tb[TCA_FW_MASK]); + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (!head) + return -ENOBUFS; + head->mask = mask; + + rcu_assign_pointer(tp->root, head); } f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); From d682d2bdc30650a5c7ce9908ab83ab674b658744 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Sep 2015 17:04:58 -0700 Subject: [PATCH 099/116] bnx2x: byte swap rss_key to comply to Toeplitz specs After a good amount of debugging, I found bnx2x was byte swaping the 40 bytes of rss_key. If we byte swap the key, then bnx2x generates hashes matching MSDN specs as documented in (Verifying the RSS Hash Calculation) https://msdn.microsoft.com/en-us/library/windows/hardware/ff571021% 28v=vs.85%29.aspx It is mostly a non issue, unless we want to mix different NIC in a host, and want consistent hashing among all of them, ie if they all use the boot time generated rss key, or if some application is choosing specific tuple(s) so that incoming traffic lands into known rx queue(s). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c index c9bd7f16018e..ff702a707a91 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c @@ -4319,8 +4319,16 @@ static int bnx2x_setup_rss(struct bnx2x *bp, /* RSS keys */ if (test_bit(BNX2X_RSS_SET_SRCH, &p->rss_flags)) { - memcpy(&data->rss_key[0], &p->rss_key[0], - sizeof(data->rss_key)); + u8 *dst = (u8 *)(data->rss_key) + sizeof(data->rss_key); + const u8 *src = (const u8 *)p->rss_key; + int i; + + /* Apparently, bnx2x reads this array in reverse order + * We need to byte swap rss_key to comply with Toeplitz specs. + */ + for (i = 0; i < sizeof(data->rss_key); i++) + *--dst = *src++; + caps |= ETH_RSS_UPDATE_RAMROD_DATA_UPDATE_RSS_KEY; } From 41fc014332d91ee90c32840bf161f9685b7fbf2b Mon Sep 17 00:00:00 2001 From: Wilson Kok Date: Tue, 22 Sep 2015 21:40:22 -0700 Subject: [PATCH 100/116] fib_rules: fix fib rule dumps across multiple skbs dump_rules returns skb length and not error. But when family == AF_UNSPEC, the caller of dump_rules assumes that it returns an error. Hence, when family == AF_UNSPEC, we continue trying to dump on -EMSGSIZE errors resulting in incorrect dump idx carried between skbs belonging to the same dump. This results in fib rule dump always only dumping rules that fit into the first skb. This patch fixes dump_rules to return error so that we exit correctly and idx is correctly maintained between skbs that are part of the same dump. Signed-off-by: Wilson Kok Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/fib_rules.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index bf77e3639ce0..365de66436ac 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -631,15 +631,17 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, { int idx = 0; struct fib_rule *rule; + int err = 0; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { if (idx < cb->args[1]) goto skip; - if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWRULE, - NLM_F_MULTI, ops) < 0) + err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWRULE, + NLM_F_MULTI, ops); + if (err) break; skip: idx++; @@ -648,7 +650,7 @@ skip: cb->args[1] = idx; rules_ops_put(ops); - return skb->len; + return err; } static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) @@ -664,7 +666,9 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) if (ops == NULL) return -EAFNOSUPPORT; - return dump_rules(skb, cb, ops); + dump_rules(skb, cb, ops); + + return skb->len; } rcu_read_lock(); From a46496ce38eeb401344d5623c1960dbf2f1769be Mon Sep 17 00:00:00 2001 From: Matt Bennett Date: Wed, 23 Sep 2015 16:58:31 +1200 Subject: [PATCH 101/116] ip6_gre: Reduce log level in ip6gre_err() to debug Currently error log messages in ip6gre_err are printed at 'warn' level. This is different to most other tunnel types which don't print any messages. These log messages don't provide any information that couldn't be deduced with networking tools. Also it can be annoying to have one end of the tunnel go down and have the logs fill with pointless messages such as "Path to destination invalid or inactive!". This patch reduces the log level of these messages to 'dbg' level to bring the visible behaviour into line with other tunnel types. Signed-off-by: Matt Bennett Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 646512488c28..3c7b9310b33f 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -404,13 +404,13 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct ipv6_tlv_tnl_enc_lim *tel; __u32 mtu; case ICMPV6_DEST_UNREACH: - net_warn_ratelimited("%s: Path to destination invalid or inactive!\n", - t->parms.name); + net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", + t->parms.name); break; case ICMPV6_TIME_EXCEED: if (code == ICMPV6_EXC_HOPLIMIT) { - net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", + t->parms.name); } break; case ICMPV6_PARAMPROB: @@ -421,12 +421,12 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (teli && teli == be32_to_cpu(info) - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { - net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", + t->parms.name); } } else { - net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n", - t->parms.name); + net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", + t->parms.name); } break; case ICMPV6_PKT_TOOBIG: From 17a10c9215b39a5ea8faeb241759c1aee6553919 Mon Sep 17 00:00:00 2001 From: Matt Bennett Date: Fri, 25 Sep 2015 11:01:47 +1200 Subject: [PATCH 102/116] ip6_tunnel: Reduce log level in ip6_tnl_err() to debug Currently error log messages in ip6_tnl_err are printed at 'warn' level. This is different to other tunnel types which don't print any messages. These log messages don't provide any information that couldn't be deduced with networking tools. Also it can be annoying to have one end of the tunnel go down and have the logs fill with pointless messages such as "Path to destination invalid or inactive!". This patch reduces the log level of these messages to 'dbg' level to bring the visible behaviour into line with other tunnel types. Signed-off-by: Matt Bennett Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 983f0d20f96d..eabffbb89795 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -569,14 +569,14 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, struct ipv6_tlv_tnl_enc_lim *tel; __u32 mtu; case ICMPV6_DEST_UNREACH: - net_warn_ratelimited("%s: Path to destination invalid or inactive!\n", - t->parms.name); + net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", + t->parms.name); rel_msg = 1; break; case ICMPV6_TIME_EXCEED: if ((*code) == ICMPV6_EXC_HOPLIMIT) { - net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", + t->parms.name); rel_msg = 1; } break; @@ -588,13 +588,13 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, if (teli && teli == *info - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { - net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", + t->parms.name); rel_msg = 1; } } else { - net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n", - t->parms.name); + net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", + t->parms.name); } break; case ICMPV6_PKT_TOOBIG: From a136442131443d929d2d8d243157824de4dfbae8 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 24 Sep 2015 20:35:52 +0100 Subject: [PATCH 103/116] phy: fix of_mdio_find_bus() device refcount leak of_mdio_find_bus() leaks a struct device refcount, caused by using class_find_device() and not realising that the device reference has its refcount incremented: * Note, you will need to drop the reference with put_device() after use. ... while ((dev = class_dev_iter_next(&iter))) { if (match(dev, data)) { get_device(dev); break; } Update the comment, and arrange for the phy code to drop this refcount when disposing of a reference to it. Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/mdio-mux.c | 19 +++++++++++++------ drivers/net/phy/mdio_bus.c | 4 +++- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/net/phy/mdio-mux.c b/drivers/net/phy/mdio-mux.c index 4d4d25efc1e1..280c7c311f72 100644 --- a/drivers/net/phy/mdio-mux.c +++ b/drivers/net/phy/mdio-mux.c @@ -113,18 +113,18 @@ int mdio_mux_init(struct device *dev, if (!parent_bus_node) return -ENODEV; - parent_bus = of_mdio_find_bus(parent_bus_node); - if (parent_bus == NULL) { - ret_val = -EPROBE_DEFER; - goto err_parent_bus; - } - pb = devm_kzalloc(dev, sizeof(*pb), GFP_KERNEL); if (pb == NULL) { ret_val = -ENOMEM; goto err_parent_bus; } + parent_bus = of_mdio_find_bus(parent_bus_node); + if (parent_bus == NULL) { + ret_val = -EPROBE_DEFER; + goto err_parent_bus; + } + pb->switch_data = data; pb->switch_fn = switch_fn; pb->current_child = -1; @@ -173,6 +173,10 @@ int mdio_mux_init(struct device *dev, dev_info(dev, "Version " DRV_VERSION "\n"); return 0; } + + /* balance the reference of_mdio_find_bus() took */ + put_device(&pb->mii_bus->dev); + err_parent_bus: of_node_put(parent_bus_node); return ret_val; @@ -189,6 +193,9 @@ void mdio_mux_uninit(void *mux_handle) mdiobus_free(cb->mii_bus); cb = cb->next; } + + /* balance the reference of_mdio_find_bus() in mdio_mux_init() took */ + put_device(&pb->mii_bus->dev); } EXPORT_SYMBOL_GPL(mdio_mux_uninit); diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 02a4615b65f8..67553e13bd36 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -167,7 +167,9 @@ static int of_mdio_bus_match(struct device *dev, const void *mdio_bus_np) * of_mdio_find_bus - Given an mii_bus node, find the mii_bus. * @mdio_bus_np: Pointer to the mii_bus. * - * Returns a pointer to the mii_bus, or NULL if none found. + * Returns a reference to the mii_bus, or NULL if none found. The + * embedded struct device will have its reference count incremented, + * and this must be put once the bus is finished with. * * Because the association of a device_node and mii_bus is made via * of_mdiobus_register(), the mii_bus cannot be found before it is From e496ae690b2faff751e1849fb97b060615e21f28 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 24 Sep 2015 20:35:57 +0100 Subject: [PATCH 104/116] net: dsa: fix of_mdio_find_bus() device refcount leak Current users of of_mdio_find_bus() leak a struct device refcount, as they fail to clean up the reference obtained inside class_find_device(). Fix the DSA code to properly refcount the returned MDIO bus by: 1. taking a reference on the struct device whenever we assign it to pd->chip[x].host_dev. 2. dropping the reference when we overwrite the existing reference. 3. dropping the reference when we free the data structure. 4. dropping the initial reference we obtained after setting up the platform data structure, or on failure. In step 2 above, where we obtain a new MDIO bus, there is no need to take a reference on it as we would only have to drop it immediately after assignment again, iow: put_device(cd->host_dev); /* drop original assignment ref */ cd->host_dev = get_device(&mdio_bus_switch->dev); /* get our ref */ put_device(&mdio_bus_switch->dev); /* drop of_mdio_find_bus ref */ Signed-off-by: Russell King Signed-off-by: David S. Miller --- net/dsa/dsa.c | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 76e3800765f8..bf4ba15fb780 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -634,6 +634,10 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd) port_index++; } kfree(pd->chip[i].rtable); + + /* Drop our reference to the MDIO bus device */ + if (pd->chip[i].host_dev) + put_device(pd->chip[i].host_dev); } kfree(pd->chip); } @@ -661,16 +665,22 @@ static int dsa_of_probe(struct device *dev) return -EPROBE_DEFER; ethernet = of_parse_phandle(np, "dsa,ethernet", 0); - if (!ethernet) - return -EINVAL; + if (!ethernet) { + ret = -EINVAL; + goto out_put_mdio; + } ethernet_dev = of_find_net_device_by_node(ethernet); - if (!ethernet_dev) - return -EPROBE_DEFER; + if (!ethernet_dev) { + ret = -EPROBE_DEFER; + goto out_put_mdio; + } pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return -ENOMEM; + if (!pd) { + ret = -ENOMEM; + goto out_put_mdio; + } dev->platform_data = pd; pd->of_netdev = ethernet_dev; @@ -691,7 +701,9 @@ static int dsa_of_probe(struct device *dev) cd = &pd->chip[chip_index]; cd->of_node = child; - cd->host_dev = &mdio_bus->dev; + + /* When assigning the host device, increment its refcount */ + cd->host_dev = get_device(&mdio_bus->dev); sw_addr = of_get_property(child, "reg", NULL); if (!sw_addr) @@ -711,6 +723,12 @@ static int dsa_of_probe(struct device *dev) ret = -EPROBE_DEFER; goto out_free_chip; } + + /* Drop the mdio_bus device ref, replacing the host + * device with the mdio_bus_switch device, keeping + * the refcount from of_mdio_find_bus() above. + */ + put_device(cd->host_dev); cd->host_dev = &mdio_bus_switch->dev; } @@ -744,6 +762,10 @@ static int dsa_of_probe(struct device *dev) } } + /* The individual chips hold their own refcount on the mdio bus, + * so drop ours */ + put_device(&mdio_bus->dev); + return 0; out_free_chip: @@ -751,6 +773,8 @@ out_free_chip: out_free: kfree(pd); dev->platform_data = NULL; +out_put_mdio: + put_device(&mdio_bus->dev); return ret; } From 3e3aaf649416988ca8be4ad2c52dc24d8be7b46e Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 24 Sep 2015 20:36:02 +0100 Subject: [PATCH 105/116] phy: fix mdiobus module safety Re-implement the mdiobus module refcounting to ensure that we actually ensure that the mdiobus module code does not go away while we might call into it. The old scheme using bus->dev.driver was buggy, because bus->dev is a class device which never has a struct device_driver associated with it, and hence the associated code trying to obtain a refcount did nothing useful. Instead, take the approach that other subsystems do: pass the module when calling mdiobus_register(), and record that in the mii_bus struct. When we need to increment the module use count in the phy code, use this stored pointer. When the phy is deteched, drop the module refcount, remembering that the phy device might go away at that point. This doesn't stop the mii_bus going away while there are in-use phys - it merely stops the underlying code vanishing. Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/mdio_bus.c | 5 +++-- drivers/net/phy/phy_device.c | 32 ++++++++++++++++++-------------- include/linux/phy.h | 5 ++++- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 67553e13bd36..992406624b7c 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -244,7 +244,7 @@ static inline void of_mdiobus_link_phydev(struct mii_bus *mdio, * * Returns 0 on success or < 0 on error. */ -int mdiobus_register(struct mii_bus *bus) +int __mdiobus_register(struct mii_bus *bus, struct module *owner) { int i, err; @@ -255,6 +255,7 @@ int mdiobus_register(struct mii_bus *bus) BUG_ON(bus->state != MDIOBUS_ALLOCATED && bus->state != MDIOBUS_UNREGISTERED); + bus->owner = owner; bus->dev.parent = bus->parent; bus->dev.class = &mdio_bus_class; bus->dev.groups = NULL; @@ -296,7 +297,7 @@ error: device_del(&bus->dev); return err; } -EXPORT_SYMBOL(mdiobus_register); +EXPORT_SYMBOL(__mdiobus_register); void mdiobus_unregister(struct mii_bus *bus) { diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index c0f211127274..03adf328f49b 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -582,10 +582,15 @@ EXPORT_SYMBOL(phy_init_hw); int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, u32 flags, phy_interface_t interface) { + struct mii_bus *bus = phydev->bus; struct device *d = &phydev->dev; - struct module *bus_module; int err; + if (!try_module_get(bus->owner)) { + dev_err(&dev->dev, "failed to get the bus module\n"); + return -EIO; + } + /* Assume that if there is no driver, that it doesn't * exist, and we should use the genphy driver. */ @@ -600,20 +605,13 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, err = device_bind_driver(d); if (err) - return err; + goto error; } if (phydev->attached_dev) { dev_err(&dev->dev, "PHY already attached\n"); - return -EBUSY; - } - - /* Increment the bus module reference count */ - bus_module = phydev->bus->dev.driver ? - phydev->bus->dev.driver->owner : NULL; - if (!try_module_get(bus_module)) { - dev_err(&dev->dev, "failed to get the bus module\n"); - return -EIO; + err = -EBUSY; + goto error; } phydev->attached_dev = dev; @@ -636,6 +634,10 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, phy_resume(phydev); return err; + +error: + module_put(bus->owner); + return err; } EXPORT_SYMBOL(phy_attach_direct); @@ -680,11 +682,9 @@ EXPORT_SYMBOL(phy_attach); */ void phy_detach(struct phy_device *phydev) { + struct mii_bus *bus; int i; - if (phydev->bus->dev.driver) - module_put(phydev->bus->dev.driver->owner); - phydev->attached_dev->phydev = NULL; phydev->attached_dev = NULL; phy_suspend(phydev); @@ -700,6 +700,10 @@ void phy_detach(struct phy_device *phydev) break; } } + + bus = phydev->bus; + + module_put(bus->owner); } EXPORT_SYMBOL(phy_detach); diff --git a/include/linux/phy.h b/include/linux/phy.h index 962387a192f1..11bce44f6d65 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -153,6 +154,7 @@ struct sk_buff; * PHYs should register using this structure */ struct mii_bus { + struct module *owner; const char *name; char id[MII_BUS_ID_SIZE]; void *priv; @@ -198,7 +200,8 @@ static inline struct mii_bus *mdiobus_alloc(void) return mdiobus_alloc_size(0); } -int mdiobus_register(struct mii_bus *bus); +int __mdiobus_register(struct mii_bus *bus, struct module *owner); +#define mdiobus_register(bus) __mdiobus_register(bus, THIS_MODULE) void mdiobus_unregister(struct mii_bus *bus); void mdiobus_free(struct mii_bus *bus); struct mii_bus *devm_mdiobus_alloc_size(struct device *dev, int sizeof_priv); From 7322967bc1bd97ac9c49ecea19e5a1f681ca27ee Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 24 Sep 2015 20:36:08 +0100 Subject: [PATCH 106/116] phy: add proper phy struct device refcounting Take a refcount on the phy struct device when the phy device is attached to a network device, and drop it after it's detached. This ensures that a refcount is held on the phy device while the device is being used by a network device, thereby preventing the phy_device from being unexpectedly kfree()'d by phy_device_release(). Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 03adf328f49b..97a4f52addac 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -578,6 +578,7 @@ EXPORT_SYMBOL(phy_init_hw); * generic driver is used. The phy_device is given a ptr to * the attaching device, and given a callback for link status * change. The phy_device is returned to the attaching driver. + * This function takes a reference on the phy device. */ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, u32 flags, phy_interface_t interface) @@ -591,6 +592,8 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, return -EIO; } + get_device(d); + /* Assume that if there is no driver, that it doesn't * exist, and we should use the genphy driver. */ @@ -636,6 +639,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, return err; error: + put_device(d); module_put(bus->owner); return err; } @@ -679,6 +683,9 @@ EXPORT_SYMBOL(phy_attach); /** * phy_detach - detach a PHY device from its network device * @phydev: target phy_device struct + * + * This detaches the phy device from its network device and the phy + * driver, and drops the reference count taken in phy_attach_direct(). */ void phy_detach(struct phy_device *phydev) { @@ -701,8 +708,13 @@ void phy_detach(struct phy_device *phydev) } } + /* + * The phydev might go away on the put_device() below, so avoid + * a use-after-free bug by reading the underlying bus first. + */ bus = phydev->bus; + put_device(&phydev->dev); module_put(bus->owner); } EXPORT_SYMBOL(phy_detach); From f018ae7a8c576345d56a0cd40d86c0574a2eb360 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 24 Sep 2015 20:36:13 +0100 Subject: [PATCH 107/116] of_mdio: fix MDIO phy device refcounting bus_find_device() is defined as: * This is similar to the bus_for_each_dev() function above, but it * returns a reference to a device that is 'found' for later use, as * determined by the @match callback. and it does indeed return a reference-counted pointer to the device: while ((dev = next_device(&i))) if (match(dev, data) && get_device(dev)) ^^^^^^^^^^^^^^^ break; klist_iter_exit(&i); return dev; What that means is that when we're done with the struct device, we must drop that reference. Neither of_phy_connect() nor of_phy_attach() did this when phy_connect_direct() or phy_attach_direct() failed. With our previous patch, phy_connect_direct() and phy_attach_direct() take a new refcount on the phy device when successful, so we can drop our local reference immediatley after these functions, whether or not they succeeded. Signed-off-by: Russell King Acked-by: Rob Herring Signed-off-by: David S. Miller --- drivers/of/of_mdio.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 1350fa25cdb0..a87a868fed64 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -197,7 +197,8 @@ static int of_phy_match(struct device *dev, void *phy_np) * of_phy_find_device - Give a PHY node, find the phy_device * @phy_np: Pointer to the phy's device tree node * - * Returns a pointer to the phy_device. + * If successful, returns a pointer to the phy_device with the embedded + * struct device refcount incremented by one, or NULL on failure. */ struct phy_device *of_phy_find_device(struct device_node *phy_np) { @@ -217,7 +218,9 @@ EXPORT_SYMBOL(of_phy_find_device); * @hndlr: Link state callback for the network device * @iface: PHY data interface type * - * Returns a pointer to the phy_device if successful. NULL otherwise + * If successful, returns a pointer to the phy_device with the embedded + * struct device refcount incremented by one, or NULL on failure. The + * refcount must be dropped by calling phy_disconnect() or phy_detach(). */ struct phy_device *of_phy_connect(struct net_device *dev, struct device_node *phy_np, @@ -225,13 +228,19 @@ struct phy_device *of_phy_connect(struct net_device *dev, phy_interface_t iface) { struct phy_device *phy = of_phy_find_device(phy_np); + int ret; if (!phy) return NULL; phy->dev_flags = flags; - return phy_connect_direct(dev, phy, hndlr, iface) ? NULL : phy; + ret = phy_connect_direct(dev, phy, hndlr, iface); + + /* refcount is held by phy_connect_direct() on success */ + put_device(&phy->dev); + + return ret ? NULL : phy; } EXPORT_SYMBOL(of_phy_connect); @@ -241,17 +250,27 @@ EXPORT_SYMBOL(of_phy_connect); * @phy_np: Node pointer for the PHY * @flags: flags to pass to the PHY * @iface: PHY data interface type + * + * If successful, returns a pointer to the phy_device with the embedded + * struct device refcount incremented by one, or NULL on failure. The + * refcount must be dropped by calling phy_disconnect() or phy_detach(). */ struct phy_device *of_phy_attach(struct net_device *dev, struct device_node *phy_np, u32 flags, phy_interface_t iface) { struct phy_device *phy = of_phy_find_device(phy_np); + int ret; if (!phy) return NULL; - return phy_attach_direct(dev, phy, flags, iface) ? NULL : phy; + ret = phy_attach_direct(dev, phy, flags, iface); + + /* refcount is held by phy_attach_direct() on success */ + put_device(&phy->dev); + + return ret ? NULL : phy; } EXPORT_SYMBOL(of_phy_attach); From 04d53b20fe44afe635b3d4438b437f7a12927e9a Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 24 Sep 2015 20:36:18 +0100 Subject: [PATCH 108/116] net: fix phy refcounting in a bunch of drivers of_phy_find_device() increments the phy struct device refcount, which we need to properly balance. Add code to network drivers using this function to ensure that the struct device refcount is correctly balanced. For xgene, looking back in the history, we should be able to use of_phy_connect() with a zero flags argument for the DT case as this is how the driver used to operate prior to de7b5b3d790a ("net: eth: xgene: change APM X-Gene SoC platform ethernet to support ACPI"). This leaves the Cavium Thunder BGX unfixed; fixing this driver is a complicated task, one which the maintainers need to be involved with. Signed-off-by: Russell King Signed-off-by: David S. Miller --- .../net/ethernet/apm/xgene/xgene_enet_hw.c | 24 ++++++++++++------- drivers/net/ethernet/freescale/gianfar.c | 3 +++ drivers/net/ethernet/freescale/ucc_geth.c | 8 ++++++- drivers/net/ethernet/marvell/mvneta.c | 2 ++ drivers/net/ethernet/xilinx/xilinx_emaclite.c | 2 ++ 5 files changed, 30 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c index cfa37041ab71..c4bb8027b3fb 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c @@ -689,16 +689,24 @@ static int xgene_enet_phy_connect(struct net_device *ndev) netdev_dbg(ndev, "No phy-handle found in DT\n"); return -ENODEV; } - pdata->phy_dev = of_phy_find_device(phy_np); - } - phy_dev = pdata->phy_dev; + phy_dev = of_phy_connect(ndev, phy_np, &xgene_enet_adjust_link, + 0, pdata->phy_mode); + if (!phy_dev) { + netdev_err(ndev, "Could not connect to PHY\n"); + return -ENODEV; + } - if (!phy_dev || - phy_connect_direct(ndev, phy_dev, &xgene_enet_adjust_link, - pdata->phy_mode)) { - netdev_err(ndev, "Could not connect to PHY\n"); - return -ENODEV; + pdata->phy_dev = phy_dev; + } else { + phy_dev = pdata->phy_dev; + + if (!phy_dev || + phy_connect_direct(ndev, phy_dev, &xgene_enet_adjust_link, + pdata->phy_mode)) { + netdev_err(ndev, "Could not connect to PHY\n"); + return -ENODEV; + } } pdata->phy_speed = SPEED_UNKNOWN; diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 803ed4c93503..a5cf4332d307 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -1702,6 +1702,7 @@ static void gfar_configure_serdes(struct net_device *dev) tbiphy = of_phy_find_device(priv->tbi_node); if (!tbiphy) { dev_err(&dev->dev, "error: Could not get TBI device\n"); + put_device(&tbiphy->dev); return; } @@ -1723,6 +1724,8 @@ static void gfar_configure_serdes(struct net_device *dev) phy_write(tbiphy, MII_BMCR, BMCR_ANENABLE | BMCR_ANRESTART | BMCR_FULLDPLX | BMCR_SPEED1000); + + put_device(&tbiphy->dev); } static int __gfar_is_rx_idle(struct gfar_private *priv) diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index 4dd40e057f40..650f7888e32b 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -1384,6 +1384,8 @@ static int adjust_enet_interface(struct ucc_geth_private *ugeth) value = phy_read(tbiphy, ENET_TBI_MII_CR); value &= ~0x1000; /* Turn off autonegotiation */ phy_write(tbiphy, ENET_TBI_MII_CR, value); + + put_device(&tbiphy->dev); } init_check_frame_length_mode(ug_info->lengthCheckRx, &ug_regs->maccfg2); @@ -1702,8 +1704,10 @@ static void uec_configure_serdes(struct net_device *dev) * everything for us? Resetting it takes the link down and requires * several seconds for it to come back. */ - if (phy_read(tbiphy, ENET_TBI_MII_SR) & TBISR_LSTATUS) + if (phy_read(tbiphy, ENET_TBI_MII_SR) & TBISR_LSTATUS) { + put_device(&tbiphy->dev); return; + } /* Single clk mode, mii mode off(for serdes communication) */ phy_write(tbiphy, ENET_TBI_MII_ANA, TBIANA_SETTINGS); @@ -1711,6 +1715,8 @@ static void uec_configure_serdes(struct net_device *dev) phy_write(tbiphy, ENET_TBI_MII_TBICON, TBICON_CLK_SELECT); phy_write(tbiphy, ENET_TBI_MII_CR, TBICR_SETTINGS); + + put_device(&tbiphy->dev); } /* Configure the PHY for dev. diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 09ec32e33076..514df76fc70f 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -3175,6 +3175,8 @@ static int mvneta_probe(struct platform_device *pdev) struct phy_device *phy = of_phy_find_device(dn); mvneta_fixed_link_update(pp, phy); + + put_device(&phy->dev); } return 0; diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c index 6008eee01a33..cf468c87ce57 100644 --- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c +++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c @@ -828,6 +828,8 @@ static int xemaclite_mdio_setup(struct net_local *lp, struct device *dev) if (!phydev) dev_info(dev, "MDIO of the phy is not registered yet\n"); + else + put_device(&phydev->dev); return 0; } From d618bf2bfd2a095644c852ebea16f5a981f9d875 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 24 Sep 2015 20:36:23 +0100 Subject: [PATCH 109/116] phy: fixed-phy: properly validate phy in fixed_phy_update_state() Validate that the phy_device passed into fixed_phy_update_state() is a fixed-phy device before walking the list of phys for a fixed phy at the same address. Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/fixed_phy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c index fb1299c6326e..e23bf5b90e17 100644 --- a/drivers/net/phy/fixed_phy.c +++ b/drivers/net/phy/fixed_phy.c @@ -220,7 +220,7 @@ int fixed_phy_update_state(struct phy_device *phydev, struct fixed_mdio_bus *fmb = &platform_fmb; struct fixed_phy *fp; - if (!phydev || !phydev->bus) + if (!phydev || phydev->bus != fmb->mii_bus) return -EINVAL; list_for_each_entry(fp, &fmb->phys, node) { From 38737e490d4ea91660d3cec83ef88c4e6d360ae4 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 24 Sep 2015 20:36:28 +0100 Subject: [PATCH 110/116] phy: add phy_device_remove() Add a phy_device_remove() function to complement phy_device_register(), which undoes the effects of phy_device_register() by removing the phy device from visibility, but not freeing it. This allows these details to be moved out of the mdio bus code into the phy code where this action belongs. Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/gianfar.c | 5 +++-- drivers/net/phy/mdio_bus.c | 15 ++++++++++----- drivers/net/phy/phy_device.c | 18 ++++++++++++++++++ include/linux/phy.h | 1 + 4 files changed, 32 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index a5cf4332d307..710715fcb23d 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -1702,7 +1702,6 @@ static void gfar_configure_serdes(struct net_device *dev) tbiphy = of_phy_find_device(priv->tbi_node); if (!tbiphy) { dev_err(&dev->dev, "error: Could not get TBI device\n"); - put_device(&tbiphy->dev); return; } @@ -1711,8 +1710,10 @@ static void gfar_configure_serdes(struct net_device *dev) * everything for us? Resetting it takes the link down and requires * several seconds for it to come back. */ - if (phy_read(tbiphy, MII_BMSR) & BMSR_LSTATUS) + if (phy_read(tbiphy, MII_BMSR) & BMSR_LSTATUS) { + put_device(&tbiphy->dev); return; + } /* Single clk mode, mii mode off(for serdes communication) */ phy_write(tbiphy, MII_TBICON, TBICON_CLK_SELECT); diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 992406624b7c..c340e412b38f 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -291,8 +291,11 @@ int __mdiobus_register(struct mii_bus *bus, struct module *owner) error: while (--i >= 0) { - if (bus->phy_map[i]) - device_unregister(&bus->phy_map[i]->dev); + struct phy_device *phydev = bus->phy_map[i]; + if (phydev) { + phy_device_remove(phydev); + phy_device_free(phydev); + } } device_del(&bus->dev); return err; @@ -307,9 +310,11 @@ void mdiobus_unregister(struct mii_bus *bus) bus->state = MDIOBUS_UNREGISTERED; for (i = 0; i < PHY_MAX_ADDR; i++) { - if (bus->phy_map[i]) - device_unregister(&bus->phy_map[i]->dev); - bus->phy_map[i] = NULL; + struct phy_device *phydev = bus->phy_map[i]; + if (phydev) { + phy_device_remove(phydev); + phy_device_free(phydev); + } } device_del(&bus->dev); } diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 97a4f52addac..f761288abe66 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -383,6 +383,24 @@ int phy_device_register(struct phy_device *phydev) } EXPORT_SYMBOL(phy_device_register); +/** + * phy_device_remove - Remove a previously registered phy device from the MDIO bus + * @phydev: phy_device structure to remove + * + * This doesn't free the phy_device itself, it merely reverses the effects + * of phy_device_register(). Use phy_device_free() to free the device + * after calling this function. + */ +void phy_device_remove(struct phy_device *phydev) +{ + struct mii_bus *bus = phydev->bus; + int addr = phydev->addr; + + device_del(&phydev->dev); + bus->phy_map[addr] = NULL; +} +EXPORT_SYMBOL(phy_device_remove); + /** * phy_find_first - finds the first PHY device on the bus * @bus: the target MII bus diff --git a/include/linux/phy.h b/include/linux/phy.h index 11bce44f6d65..4a4e3a092337 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -745,6 +745,7 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, int phy_id, struct phy_c45_device_ids *c45_ids); struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45); int phy_device_register(struct phy_device *phy); +void phy_device_remove(struct phy_device *phydev); int phy_init_hw(struct phy_device *phydev); int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); From 9861f72074c77a8a065622c1be7e9c4277e600eb Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 24 Sep 2015 20:36:33 +0100 Subject: [PATCH 111/116] net: fix net_device refcounting of_find_net_device_by_node() uses class_find_device() internally to lookup the corresponding network device. class_find_device() returns a reference to the embedded struct device, with its refcount incremented. Add a comment to the definition in net/core/net-sysfs.c indicating the need to drop this refcount, and fix the DSA code to drop this refcount when the OF-generated platform data is cleaned up and freed. Also arrange for the ref to be dropped when handling errors. Signed-off-by: Russell King Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 9 +++++++++ net/dsa/dsa.c | 5 ++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b279077c3089..805a95a48107 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1481,6 +1481,15 @@ static int of_dev_node_match(struct device *dev, const void *data) return ret == 0 ? dev->of_node == data : ret; } +/* + * of_find_net_device_by_node - lookup the net device for the device node + * @np: OF device node + * + * Looks up the net_device structure corresponding with the device node. + * If successful, returns a pointer to the net_device with the embedded + * struct device refcount incremented by one, or NULL on failure. The + * refcount must be dropped when done with the net_device. + */ struct net_device *of_find_net_device_by_node(struct device_node *np) { struct device *dev; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index bf4ba15fb780..c59fa5d9c22c 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -679,7 +679,7 @@ static int dsa_of_probe(struct device *dev) pd = kzalloc(sizeof(*pd), GFP_KERNEL); if (!pd) { ret = -ENOMEM; - goto out_put_mdio; + goto out_put_ethernet; } dev->platform_data = pd; @@ -773,6 +773,8 @@ out_free_chip: out_free: kfree(pd); dev->platform_data = NULL; +out_put_ethernet: + put_device(ðernet_dev->dev); out_put_mdio: put_device(&mdio_bus->dev); return ret; @@ -786,6 +788,7 @@ static void dsa_of_remove(struct device *dev) return; dsa_of_free_platform_data(pd); + put_device(&pd->of_netdev->dev); kfree(pd); } #else From 357cd64c18404309aabefb82019b12773de31a12 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 24 Sep 2015 00:07:17 +0100 Subject: [PATCH 112/116] phy: marvell: add link partner advertised modes Read the standard link partner advertisment registers and store it in phydev->lp_advertising, so ethtool can report this information to userspace via ethtool. Zero it as per genphy if autonegotiation is disabled. Tested with a Marvell 88E1512 PHY. Signed-off-by: Russell King Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index e6897b6a8a53..5de8d5827536 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -785,6 +785,7 @@ static int marvell_read_status(struct phy_device *phydev) int adv; int err; int lpa; + int lpagb; int status = 0; /* Update the link, but return if there @@ -802,10 +803,17 @@ static int marvell_read_status(struct phy_device *phydev) if (lpa < 0) return lpa; + lpagb = phy_read(phydev, MII_STAT1000); + if (lpagb < 0) + return lpagb; + adv = phy_read(phydev, MII_ADVERTISE); if (adv < 0) return adv; + phydev->lp_advertising = mii_stat1000_to_ethtool_lpa_t(lpagb) | + mii_lpa_to_ethtool_lpa_t(lpa); + lpa &= adv; if (status & MII_M1011_PHY_STATUS_FULLDUPLEX) @@ -853,6 +861,7 @@ static int marvell_read_status(struct phy_device *phydev) phydev->speed = SPEED_10; phydev->pause = phydev->asym_pause = 0; + phydev->lp_advertising = 0; } return 0; From 21343ac21ec7d871e94e98e288f3398a4207d9c0 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Thu, 24 Sep 2015 15:46:53 +0530 Subject: [PATCH 113/116] net: via/Kconfig: GENERIC_PCI_IOMAP required if PCI not selected The builds of allmodconfig of avr32 is failing with: drivers/net/ethernet/via/via-rhine.c:1098:2: error: implicit declaration of function 'pci_iomap' [-Werror=implicit-function-declaration] drivers/net/ethernet/via/via-rhine.c:1119:2: error: implicit declaration of function 'pci_iounmap' [-Werror=implicit-function-declaration] The generic empty pci_iomap and pci_iounmap is used only if CONFIG_PCI is not defined and CONFIG_GENERIC_PCI_IOMAP is defined. Add GENERIC_PCI_IOMAP in the dependency list for VIA_RHINE as we are getting build failure when CONFIG_PCI and CONFIG_GENERIC_PCI_IOMAP both are not defined. Signed-off-by: Sudip Mukherjee Signed-off-by: David S. Miller --- drivers/net/ethernet/via/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/via/Kconfig b/drivers/net/ethernet/via/Kconfig index 2f1264b882b9..d3d094742a7e 100644 --- a/drivers/net/ethernet/via/Kconfig +++ b/drivers/net/ethernet/via/Kconfig @@ -17,7 +17,7 @@ if NET_VENDOR_VIA config VIA_RHINE tristate "VIA Rhine support" - depends on (PCI || OF_IRQ) + depends on PCI || (OF_IRQ && GENERIC_PCI_IOMAP) depends on HAS_DMA select CRC32 select MII From 58a89ecaca53736aa465170530acea4f8be34ab4 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 24 Sep 2015 12:54:01 +0200 Subject: [PATCH 114/116] ppp: fix lockdep splat in ppp_dev_uninit() ppp_dev_uninit() locks all_ppp_mutex while under rtnl mutex protection. ppp_create_interface() must then lock these mutexes in that same order to avoid possible deadlock. [ 120.880011] ====================================================== [ 120.880011] [ INFO: possible circular locking dependency detected ] [ 120.880011] 4.2.0 #1 Not tainted [ 120.880011] ------------------------------------------------------- [ 120.880011] ppp-apitest/15827 is trying to acquire lock: [ 120.880011] (&pn->all_ppp_mutex){+.+.+.}, at: [] ppp_dev_uninit+0x64/0xb0 [ppp_generic] [ 120.880011] [ 120.880011] but task is already holding lock: [ 120.880011] (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0x12/0x14 [ 120.880011] [ 120.880011] which lock already depends on the new lock. [ 120.880011] [ 120.880011] [ 120.880011] the existing dependency chain (in reverse order) is: [ 120.880011] [ 120.880011] -> #1 (rtnl_mutex){+.+.+.}: [ 120.880011] [] lock_acquire+0xcf/0x10e [ 120.880011] [] mutex_lock_nested+0x56/0x341 [ 120.880011] [] rtnl_lock+0x12/0x14 [ 120.880011] [] register_netdev+0x11/0x27 [ 120.880011] [] ppp_ioctl+0x289/0xc98 [ppp_generic] [ 120.880011] [] do_vfs_ioctl+0x4ea/0x532 [ 120.880011] [] SyS_ioctl+0x4e/0x7d [ 120.880011] [] entry_SYSCALL_64_fastpath+0x12/0x6f [ 120.880011] [ 120.880011] -> #0 (&pn->all_ppp_mutex){+.+.+.}: [ 120.880011] [] __lock_acquire+0xb07/0xe76 [ 120.880011] [] lock_acquire+0xcf/0x10e [ 120.880011] [] mutex_lock_nested+0x56/0x341 [ 120.880011] [] ppp_dev_uninit+0x64/0xb0 [ppp_generic] [ 120.880011] [] rollback_registered_many+0x19e/0x252 [ 120.880011] [] rollback_registered+0x29/0x38 [ 120.880011] [] unregister_netdevice_queue+0x6a/0x77 [ 120.880011] [] ppp_release+0x42/0x79 [ppp_generic] [ 120.880011] [] __fput+0xec/0x192 [ 120.880011] [] ____fput+0x9/0xb [ 120.880011] [] task_work_run+0x66/0x80 [ 120.880011] [] prepare_exit_to_usermode+0x8c/0xa7 [ 120.880011] [] syscall_return_slowpath+0xe4/0x104 [ 120.880011] [] int_ret_from_sys_call+0x25/0x9f [ 120.880011] [ 120.880011] other info that might help us debug this: [ 120.880011] [ 120.880011] Possible unsafe locking scenario: [ 120.880011] [ 120.880011] CPU0 CPU1 [ 120.880011] ---- ---- [ 120.880011] lock(rtnl_mutex); [ 120.880011] lock(&pn->all_ppp_mutex); [ 120.880011] lock(rtnl_mutex); [ 120.880011] lock(&pn->all_ppp_mutex); [ 120.880011] [ 120.880011] *** DEADLOCK *** Fixes: 8cb775bc0a34 ("ppp: fix device unregistration upon netns deletion") Reported-by: Sedat Dilek Tested-by: Sedat Dilek Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- drivers/net/ppp/ppp_generic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 0481daf9201a..ed00446759b2 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -2755,6 +2755,7 @@ static struct ppp *ppp_create_interface(struct net *net, int unit, */ dev_net_set(dev, net); + rtnl_lock(); mutex_lock(&pn->all_ppp_mutex); if (unit < 0) { @@ -2785,7 +2786,7 @@ static struct ppp *ppp_create_interface(struct net *net, int unit, ppp->file.index = unit; sprintf(dev->name, "ppp%d", unit); - ret = register_netdev(dev); + ret = register_netdevice(dev); if (ret != 0) { unit_put(&pn->units_idr, unit); netdev_err(ppp->dev, "PPP: couldn't register device %s (%d)\n", @@ -2797,6 +2798,7 @@ static struct ppp *ppp_create_interface(struct net *net, int unit, atomic_inc(&ppp_unit_count); mutex_unlock(&pn->all_ppp_mutex); + rtnl_unlock(); *retp = 0; return ppp; From 59f069789c98678710ed30a4be0daa3546ec82c7 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 25 Sep 2015 11:56:56 +0100 Subject: [PATCH 115/116] net: update docbook comment for __mdiobus_register() Update the docbook comment for __mdiobus_register() to include the new module owner argument. This resolves a warning found by the 0-day builder. Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/mdio_bus.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index c340e412b38f..12f44c53cc8e 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -236,11 +236,14 @@ static inline void of_mdiobus_link_phydev(struct mii_bus *mdio, #endif /** - * mdiobus_register - bring up all the PHYs on a given bus and attach them to bus + * __mdiobus_register - bring up all the PHYs on a given bus and attach them to bus * @bus: target mii_bus + * @owner: module containing bus accessor functions * * Description: Called by a bus driver to bring up all the PHYs - * on a given bus, and attach them to the bus. + * on a given bus, and attach them to the bus. Drivers should use + * mdiobus_register() rather than __mdiobus_register() unless they + * need to pass a specific owner module. * * Returns 0 on success or < 0 on error. */ From bdb06cbf77cb01911694cc9076ffa8196b7b9b61 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 24 Sep 2015 15:31:29 -0600 Subject: [PATCH 116/116] net: Fix panic in icmp_route_lookup Andrey reported a panic: [ 7249.865507] BUG: unable to handle kernel pointer dereference at 000000b4 [ 7249.865559] IP: [] icmp_route_lookup+0xaa/0x320 [ 7249.865598] *pdpt = 0000000030f7f001 *pde = 0000000000000000 [ 7249.865637] Oops: 0000 [#1] ... [ 7249.866811] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.3.0-999-generic #201509220155 [ 7249.866876] Hardware name: MSI MS-7250/MS-7250, BIOS 080014 08/02/2006 [ 7249.866916] task: c1a5ab00 ti: c1a52000 task.ti: c1a52000 [ 7249.866949] EIP: 0060:[] EFLAGS: 00210246 CPU: 0 [ 7249.866981] EIP is at icmp_route_lookup+0xaa/0x320 [ 7249.867012] EAX: 00000000 EBX: f483ba48 ECX: 00000000 EDX: f2e18a00 [ 7249.867045] ESI: 000000c0 EDI: f483ba70 EBP: f483b9ec ESP: f483b974 [ 7249.867077] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 [ 7249.867108] CR0: 8005003b CR2: 000000b4 CR3: 36ee07c0 CR4: 000006f0 [ 7249.867141] Stack: [ 7249.867165] 320310ee 00000000 00000042 320310ee 00000000 c1aeca00 f3920240 f0c69180 [ 7249.867268] f483ba04 f855058b a89b66cd f483ba44 f8962f4b 00000000 e659266c f483ba54 [ 7249.867361] 8004753c f483ba5c f8962f4b f2031140 000003c1 ffbd8fa0 c16b0e00 00000064 [ 7249.867448] Call Trace: [ 7249.867494] [] ? e1000_xmit_frame+0x87b/0xdc0 [e1000e] [ 7249.867534] [] ? tcp_in_window+0xeb/0xb10 [nf_conntrack] [ 7249.867576] [] ? tcp_in_window+0xeb/0xb10 [nf_conntrack] [ 7249.867615] [] ? icmp_send+0xa0/0x380 [ 7249.867648] [] icmp_send+0x2cf/0x380 [ 7249.867681] [] nf_send_unreach+0xa6/0xc0 [nf_reject_ipv4] [ 7249.867714] [] reject_tg+0x7a/0x9f [ipt_REJECT] [ 7249.867746] [] ipt_do_table+0x317/0x70c [ip_tables] [ 7249.867780] [] ? __nf_conntrack_find_get+0x166/0x3b0 [nf_conntrack] [ 7249.867838] [] ? nf_conntrack_in+0x398/0x600 [nf_conntrack] [ 7249.867889] [] iptable_filter_hook+0x35/0x80 [iptable_filter] [ 7249.867933] [] nf_iterate+0x71/0x80 [ 7249.867970] [] nf_hook_slow+0x65/0xc0 [ 7249.868002] [] __ip_local_out_sk+0xc1/0xd0 [ 7249.868034] [] ? ip_forward_options+0x1a0/0x1a0 [ 7249.868066] [] ip_local_out_sk+0x16/0x30 [ 7249.868097] [] ip_send_skb+0x14/0x80 [ 7249.868129] [] ip_push_pending_frames+0x34/0x40 [ 7249.868163] [] ip_send_unicast_reply+0x282/0x310 [ 7249.868196] [] tcp_v4_send_reset+0x1b3/0x380 [ 7249.868227] [] tcp_v4_rcv+0x323/0x990 [ 7249.868257] [] ? nf_iterate+0x71/0x80 [ 7249.868289] [] ip_local_deliver_finish+0x8b/0x230 [ 7249.868322] [] ip_local_deliver+0x4c/0xa0 [ 7249.868353] [] ? ip_rcv_finish+0x390/0x390 [ 7249.868384] [] ip_rcv_finish+0x7c/0x390 [ 7249.868415] [] ip_rcv+0x2e0/0x420 ... Prior to the VRF change the oif was not set in the flow struct, so the VRF support should really have only added the vrf_master_ifindex lookup. Fixes: 613d09b30f8b ("net: Use VRF device index for lookups on TX") Cc: Andrey Melnikov Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 79fe05befcae..e5eb8ac4089d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -427,7 +427,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.flowi4_mark = mark; fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; - fl4.flowi4_oif = vrf_master_ifindex(skb->dev) ? : skb->dev->ifindex; + fl4.flowi4_oif = vrf_master_ifindex(skb->dev); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) @@ -461,7 +461,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev) ? : skb_in->dev->ifindex; + fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); rt = __ip_route_output_key(net, fl4);