From bf798657eb5ba57552096843c315f096fdf9b715 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 12 Aug 2015 17:41:00 +0200 Subject: [PATCH 01/65] netfilter: nf_tables: Use 32 bit addressing register from nft_type_to_reg() nft_type_to_reg() needs to return the register in the new 32 bit addressing, otherwise we hit EINVAL when using mappings. Fixes: 49499c3 ("netfilter: nf_tables: switch registers to 32 bit addressing") Reported-by: Andreas Schultz Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2a246680a6c3..aa8bee72c9d3 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -125,7 +125,7 @@ static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg) static inline enum nft_registers nft_type_to_reg(enum nft_data_types type) { - return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1; + return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE; } unsigned int nft_parse_register(const struct nlattr *attr); From 18e1db67e93ed75d9dc0d34c8d783ccf10547c2b Mon Sep 17 00:00:00 2001 From: Bernhard Thaler Date: Thu, 13 Aug 2015 08:58:15 +0200 Subject: [PATCH 02/65] netfilter: bridge: fix IPv6 packets not being bridged with CONFIG_IPV6=n 230ac490f7fba introduced a dependency to CONFIG_IPV6 which breaks bridging of IPv6 packets on a bridge with CONFIG_IPV6=n. Sysctl entry /proc/sys/net/bridge/bridge-nf-call-ip6tables defaults to 1, for this reason packets are handled by br_nf_pre_routing_ipv6(). When compiled with CONFIG_IPV6=n this function returns NF_DROP but should return NF_ACCEPT to let packets through. Change CONFIG_IPV6=n br_nf_pre_routing_ipv6() return value to NF_ACCEPT. 
Tested with a simple bridge with two interfaces and IPv6 packets trying to pass from host on left side to host on right side of the bridge. Fixes: 230ac490f7fba ("netfilter: bridge: split ipv6 code into separated file") Signed-off-by: Bernhard Thaler Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/br_netfilter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/netfilter/br_netfilter.h b/include/net/netfilter/br_netfilter.h index bab824bde92c..d4c6b5f30acd 100644 --- a/include/net/netfilter/br_netfilter.h +++ b/include/net/netfilter/br_netfilter.h @@ -59,7 +59,7 @@ static inline unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct nf_hook_state *state) { - return NF_DROP; + return NF_ACCEPT; } #endif From 6fe7ccfd77415a6ba250c10c580eb3f9acf79753 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 25 Aug 2015 11:17:51 +0200 Subject: [PATCH 03/65] netfilter: ipset: Out of bound access in hash:net* types fixed Dave Jones reported that KASan detected out of bounds access in hash:net* types: [ 23.139532] ================================================================== [ 23.146130] BUG: KASan: out of bounds access in hash_net4_add_cidr+0x1db/0x220 at addr ffff8800d4844b58 [ 23.152937] Write of size 4 by task ipset/457 [ 23.159742] ============================================================================= [ 23.166672] BUG kmalloc-512 (Not tainted): kasan: bad access detected [ 23.173641] ----------------------------------------------------------------------------- [ 23.194668] INFO: Allocated in hash_net_create+0x16a/0x470 age=7 cpu=1 pid=456 [ 23.201836] __slab_alloc.constprop.66+0x554/0x620 [ 23.208994] __kmalloc+0x2f2/0x360 [ 23.216105] hash_net_create+0x16a/0x470 [ 23.223238] ip_set_create+0x3e6/0x740 [ 23.230343] nfnetlink_rcv_msg+0x599/0x640 [ 23.237454] netlink_rcv_skb+0x14f/0x190 [ 23.244533] nfnetlink_rcv+0x3f6/0x790 [ 23.251579] netlink_unicast+0x272/0x390 [ 
23.258573] netlink_sendmsg+0x5a1/0xa50 [ 23.265485] SYSC_sendto+0x1da/0x2c0 [ 23.272364] SyS_sendto+0xe/0x10 [ 23.279168] entry_SYSCALL_64_fastpath+0x12/0x6f The bug is fixed in the patch and the testsuite is extended in ipset to check cidr handling more thoroughly. Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_hash_gen.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index afe905c208af..691b54fcaf2a 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -152,9 +152,13 @@ htable_bits(u32 hashsize) #define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128) #ifdef IP_SET_HASH_WITH_NET0 +/* cidr from 0 to SET_HOST_MASK() value and c = cidr + 1 */ #define NLEN(family) (SET_HOST_MASK(family) + 1) +#define CIDR_POS(c) ((c) - 1) #else +/* cidr from 1 to SET_HOST_MASK() value and c = cidr + 1 */ #define NLEN(family) SET_HOST_MASK(family) +#define CIDR_POS(c) ((c) - 2) #endif #else @@ -305,7 +309,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) } else if (h->nets[i].cidr[n] < cidr) { j = i; } else if (h->nets[i].cidr[n] == cidr) { - h->nets[cidr - 1].nets[n]++; + h->nets[CIDR_POS(cidr)].nets[n]++; return; } } @@ -314,7 +318,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) h->nets[i].cidr[n] = h->nets[i - 1].cidr[n]; } h->nets[i].cidr[n] = cidr; - h->nets[cidr - 1].nets[n] = 1; + h->nets[CIDR_POS(cidr)].nets[n] = 1; } static void @@ -325,8 +329,8 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) for (i = 0; i < nets_length; i++) { if (h->nets[i].cidr[n] != cidr) continue; - h->nets[cidr - 1].nets[n]--; - if (h->nets[cidr - 1].nets[n] > 0) + h->nets[CIDR_POS(cidr)].nets[n]--; + if (h->nets[CIDR_POS(cidr)].nets[n] > 0) return; for (j = i; j < net_end && h->nets[j].cidr[n]; j++) h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; From 
96be5f2806cd65a2ebced3bfcdf7df0116e6c4a6 Mon Sep 17 00:00:00 2001 From: Elad Raz Date: Sat, 22 Aug 2015 08:44:11 +0300 Subject: [PATCH 04/65] netfilter: ipset: Fixing unnamed union init In continue to proposed Vinson Lee's post [1], this patch fixes compilation issues founded at gcc 4.4.7. The initialization of .cidr field of unnamed unions causes compilation error in gcc 4.4.x. References Visible links [1] https://lkml.org/lkml/2015/7/5/74 Signed-off-by: Elad Raz Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_netnet.c | 20 ++++++++++++++++++-- net/netfilter/ipset/ip_set_hash_netportnet.c | 20 ++++++++++++++++++-- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 3c862c0a76d1..a93dfebffa81 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -131,6 +131,13 @@ hash_netnet4_data_next(struct hash_netnet4_elem *next, #define HOST_MASK 32 #include "ip_set_hash_gen.h" +static void +hash_netnet4_init(struct hash_netnet4_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -160,7 +167,7 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_netnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netnet4_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; + struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, last; u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2; @@ -169,6 +176,7 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + hash_netnet4_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; @@ 
-357,6 +365,13 @@ hash_netnet6_data_next(struct hash_netnet4_elem *next, #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" +static void +hash_netnet6_init(struct hash_netnet6_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -385,13 +400,14 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; + struct hash_netnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + hash_netnet6_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 0c68734f5cc4..9a14c237830f 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -142,6 +142,13 @@ hash_netportnet4_data_next(struct hash_netportnet4_elem *next, #define HOST_MASK 32 #include "ip_set_hash_gen.h" +static void +hash_netportnet4_init(struct hash_netportnet4_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -175,7 +182,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netportnet4_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; + struct hash_netportnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to; u32 ip2_from = 0, ip2_to = 0, 
ip2_last, ip2; @@ -185,6 +192,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + hash_netportnet4_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || @@ -412,6 +420,13 @@ hash_netportnet6_data_next(struct hash_netportnet4_elem *next, #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" +static void +hash_netportnet6_init(struct hash_netportnet6_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -445,7 +460,7 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netportnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; + struct hash_netportnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; @@ -454,6 +469,7 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + hash_netportnet6_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || From a9de9777d613500b089a7416f936bf3ae5f070d2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 28 Aug 2015 21:01:43 +0200 Subject: [PATCH 05/65] netfilter: nfnetlink: work around wrong endianess in res_id field The convention in nfnetlink is to use network byte order in every header field as well as in the attribute payload. The initial version of the batching infrastructure assumes that res_id comes in host byte order though. 
The only client of the batching infrastructure is nf_tables, so let's add a workaround to address this inconsistency. We currently have 11 nfnetlink subsystems according to NFNL_SUBSYS_COUNT, so we can assume that the subsystem 2560, ie. htons(10), will not be allocated anytime soon, so it can be an alias of nf_tables from the nfnetlink batching path when interpreting the res_id field. Based on original patch from Florian Westphal. Reported-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 0c0e8ecf02ab..70277b11f742 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -444,6 +444,7 @@ done: static void nfnetlink_rcv(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); + u_int16_t res_id; int msglen; if (nlh->nlmsg_len < NLMSG_HDRLEN || @@ -468,7 +469,12 @@ static void nfnetlink_rcv(struct sk_buff *skb) nfgenmsg = nlmsg_data(nlh); skb_pull(skb, msglen); - nfnetlink_rcv_batch(skb, nlh, nfgenmsg->res_id); + /* Work around old nft using host byte order */ + if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES) + res_id = NFNL_SUBSYS_NFTABLES; + else + res_id = ntohs(nfgenmsg->res_id); + nfnetlink_rcv_batch(skb, nlh, res_id); } else { netlink_rcv_skb(skb, &nfnetlink_rcv_msg); } From 9cf94eab8b309e8bcc78b41dd1561c75b537dd0b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 31 Aug 2015 19:11:02 +0200 Subject: [PATCH 06/65] netfilter: conntrack: use nf_ct_tmpl_free in CT/synproxy error paths Commit 0838aa7fcfcd ("netfilter: fix netns dependencies with conntrack templates") migrated templates to the new allocator api, but forgot to update error paths for them in CT and synproxy to use nf_ct_tmpl_free() instead of nf_conntrack_free(). Due to that, memory is being freed into the wrong kmemcache, but also we drop the per net reference count of ct objects causing an imbalance. 
In Brad's case, this leads to a wrap-around of net->ct.count and thus lets __nf_conntrack_alloc() refuse to create a new ct object: [ 10.340913] xt_addrtype: ipv6 does not support BROADCAST matching [ 10.810168] nf_conntrack: table full, dropping packet [ 11.917416] r8169 0000:07:00.0 eth0: link up [ 11.917438] IPv6: ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready [ 12.815902] nf_conntrack: table full, dropping packet [ 15.688561] nf_conntrack: table full, dropping packet [ 15.689365] nf_conntrack: table full, dropping packet [ 15.690169] nf_conntrack: table full, dropping packet [ 15.690967] nf_conntrack: table full, dropping packet [...] With slab debugging, it also reports the wrong kmemcache (kmalloc-512 vs. nf_conntrack_ffffffff81ce75c0) and reports poison overwrites, etc. Thus, to fix the problem, export and use nf_ct_tmpl_free() instead. Fixes: 0838aa7fcfcd ("netfilter: fix netns dependencies with conntrack templates") Reported-by: Brad Jackson Signed-off-by: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 1 + net/netfilter/nf_conntrack_core.c | 3 ++- net/netfilter/nf_synproxy_core.c | 2 +- net/netfilter/xt_CT.c | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 37cd3911d5c5..4023c4ce260f 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -292,6 +292,7 @@ extern unsigned int nf_conntrack_hash_rnd; void init_nf_conntrack_hash_rnd(void); struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags); +void nf_ct_tmpl_free(struct nf_conn *tmpl); #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3c20d02aee73..0625a42df108 100644 --- a/net/netfilter/nf_conntrack_core.c +++ 
b/net/netfilter/nf_conntrack_core.c @@ -320,12 +320,13 @@ out_free: } EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); -static void nf_ct_tmpl_free(struct nf_conn *tmpl) +void nf_ct_tmpl_free(struct nf_conn *tmpl) { nf_ct_ext_destroy(tmpl); nf_ct_ext_free(tmpl); kfree(tmpl); } +EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); static void destroy_conntrack(struct nf_conntrack *nfct) diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index d7f168527903..d6ee8f8b19b6 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -378,7 +378,7 @@ static int __net_init synproxy_net_init(struct net *net) err3: free_percpu(snet->stats); err2: - nf_conntrack_free(ct); + nf_ct_tmpl_free(ct); err1: return err; } diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 43ddeee404e9..f3377ce1ff18 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -233,7 +233,7 @@ out: return 0; err3: - nf_conntrack_free(ct); + nf_ct_tmpl_free(ct); err2: nf_ct_l3proto_module_put(par->family); err1: From 4548a697e4969d695047cebd6d9af5e2f6cc728e Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Wed, 2 Sep 2015 17:49:29 +0900 Subject: [PATCH 07/65] net: eth: altera: fix napi poll_list corruption tse_poll() calls __napi_complete() with irq enabled. This leads napi poll_list corruption and may stop all napi drivers working. Use napi_complete() instead of __napi_complete(). Signed-off-by: Atsushi Nemoto Signed-off-by: David S. 
Miller --- drivers/net/ethernet/altera/altera_tse_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/altera/altera_tse_main.c b/drivers/net/ethernet/altera/altera_tse_main.c index da48e66377b5..8207877d6237 100644 --- a/drivers/net/ethernet/altera/altera_tse_main.c +++ b/drivers/net/ethernet/altera/altera_tse_main.c @@ -511,8 +511,7 @@ static int tse_poll(struct napi_struct *napi, int budget) if (rxcomplete < budget) { - napi_gro_flush(napi, false); - __napi_complete(napi); + napi_complete(napi); netdev_dbg(priv->dev, "NAPI Complete, did %d packets with budget %d\n", From d82f0f1fc8a4f214a50c9dfc64e3896f9894afb7 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 2 Sep 2015 16:20:21 -0300 Subject: [PATCH 08/65] sctp: fix dst leak Commit 0ca50d12fe46 failed to release the reference to dst entries that it decided to skip. Fixes: 0ca50d12fe46 ("sctp: fix src address selection if using secondary addresses") Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/protocol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 4345790ad326..4abf94d4cce7 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -511,8 +511,10 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, */ odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr, false); - if (!odev || odev->ifindex != fl4->flowi4_oif) + if (!odev || odev->ifindex != fl4->flowi4_oif) { + dst_release(&rt->dst); continue; + } dst = &rt->dst; break; From 410f03831c0768f2b1850d28ba697b167ddcb89b Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 2 Sep 2015 16:20:22 -0300 Subject: [PATCH 09/65] sctp: add routing output fallback Commit 0ca50d12fe46 added a restriction that the address must belong to the output interface, so that sctp will use the right interface even when using secondary addresses. 
But it breaks IPVS setups, in which people are used to attaching VIP addresses to the loopback interface on real servers. It's preferred to attach to the interface actually in use, but it's a very common setup and that used to work. This patch then saves the first good routing result, even if it would be going out through an interface that doesn't have that address. If no better hit is found, it's then used. This effectively restores the original behavior if no better interface could be found. Fixes: 0ca50d12fe46 ("sctp: fix src address selection if using secondary addresses") Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/protocol.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 4abf94d4cce7..b7143337e4fa 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -506,16 +506,22 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, if (IS_ERR(rt)) continue; + if (!dst) + dst = &rt->dst; + /* Ensure the src address belongs to the output * interface. */ odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr, false); if (!odev || odev->ifindex != fl4->flowi4_oif) { - dst_release(&rt->dst); + if (&rt->dst != dst) + dst_release(&rt->dst); continue; } + if (dst != &rt->dst) + dst_release(dst); dst = &rt->dst; break; } From 98a1f8282b8c37378c1b947d661a58942331ca90 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 26 Aug 2015 12:22:14 +0200 Subject: [PATCH 10/65] mac80211: Do not use sizeof() on pointer type The rate_control_cap_mask() function takes a parameter mcs_mask, which GCC will take to be u8 * even though it was declared with a fixed size.
This causes the following warning: net/mac80211/rate.c: In function 'rate_control_cap_mask': net/mac80211/rate.c:719:25: warning: 'sizeof' on array function parameter 'mcs_mask' will return size of 'u8 * {aka unsigned char *}' [-Wsizeof-array-argument] for (i = 0; i < sizeof(mcs_mask); i++) ^ net/mac80211/rate.c:684:10: note: declared here u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN], ^ This can be easily fixed by using the IEEE80211_HT_MCS_MASK_LEN directly within the loop condition. Signed-off-by: Thierry Reding Signed-off-by: Johannes Berg --- net/mac80211/rate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 9857693b91ec..9ce8883d5f44 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -716,7 +716,7 @@ static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata, /* Filter out rates that the STA does not support */ *mask &= sta->supp_rates[sband->band]; - for (i = 0; i < sizeof(mcs_mask); i++) + for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) mcs_mask[i] &= sta->ht_cap.mcs.rx_mask[i]; sta_vht_cap = sta->vht_cap.vht_mcs.rx_mcs_map; From 22f66895e60cfc55b92f6fa93f05bb3fbdbd0bed Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Tue, 18 Aug 2015 16:52:07 +0300 Subject: [PATCH 11/65] mac80211: protect non-HT BSS when HT TDLS traffic exists HT TDLS traffic should be protected in a non-HT BSS to avoid collisions. Therefore, when TDLS peers join/leave, check if protection is (now) needed and set the ht_operation_mode of the virtual interface according to the HT capabilities of the TDLS peer(s). This works because a non-HT BSS connection never sets (or otherwise uses) the ht_operation_mode; it just means that drivers must be aware that this field applies to all HT traffic for this virtual interface, not just the traffic within the BSS. Document that. 
Signed-off-by: Avri Altman Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++- net/mac80211/tdls.c | 70 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 70 insertions(+), 4 deletions(-) diff --git a/include/net/mac80211.h b/include/net/mac80211.h index e3314e516681..bfc569498bfa 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -477,7 +477,9 @@ struct ieee80211_event { * @chandef: Channel definition for this BSS -- the hardware might be * configured a higher bandwidth than this BSS uses, for example. * @ht_operation_mode: HT operation mode like in &struct ieee80211_ht_operation. - * This field is only valid when the channel type is one of the HT types. + * This field is only valid when the channel is a wide HT/VHT channel. + * Note that with TDLS this can be the case (channel is HT, protection must + * be used from this field) even when the BSS association isn't using HT. * @cqm_rssi_thold: Connection quality monitor RSSI threshold, a zero value * implies disabled * @cqm_rssi_hyst: Connection quality monitor RSSI hysteresis diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index aee701a5649e..4e202d0679b2 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1249,6 +1249,58 @@ static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata) mutex_unlock(&local->chanctx_mtx); } +static int iee80211_tdls_have_ht_peers(struct ieee80211_sub_if_data *sdata) +{ + struct sta_info *sta; + bool result = false; + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { + if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded || + !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || + !test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH) || + !sta->sta.ht_cap.ht_supported) + continue; + result = true; + break; + } + rcu_read_unlock(); + + return result; +} + +static void +iee80211_tdls_recalc_ht_protection(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta) +{ + struct 
ieee80211_if_managed *ifmgd = &sdata->u.mgd; + bool tdls_ht; + u16 protection = IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED | + IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT | + IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT; + u16 opmode; + + /* Nothing to do if the BSS connection uses HT */ + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) + return; + + tdls_ht = (sta && sta->sta.ht_cap.ht_supported) || + iee80211_tdls_have_ht_peers(sdata); + + opmode = sdata->vif.bss_conf.ht_operation_mode; + + if (tdls_ht) + opmode |= protection; + else + opmode &= ~protection; + + if (opmode == sdata->vif.bss_conf.ht_operation_mode) + return; + + sdata->vif.bss_conf.ht_operation_mode = opmode; + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_HT); +} + int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, enum nl80211_tdls_operation oper) { @@ -1274,6 +1326,10 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, return -ENOTSUPP; } + /* protect possible bss_conf changes and avoid concurrency in + * ieee80211_bss_info_change_notify() + */ + sdata_lock(sdata); mutex_lock(&local->mtx); tdls_dbg(sdata, "TDLS oper %d peer %pM\n", oper, peer); @@ -1287,16 +1343,18 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, iee80211_tdls_recalc_chanctx(sdata); - rcu_read_lock(); + mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, peer); if (!sta) { - rcu_read_unlock(); + mutex_unlock(&local->sta_mtx); ret = -ENOLINK; break; } + iee80211_tdls_recalc_ht_protection(sdata, sta); + set_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH); - rcu_read_unlock(); + mutex_unlock(&local->sta_mtx); WARN_ON_ONCE(is_zero_ether_addr(sdata->u.mgd.tdls_peer) || !ether_addr_equal(sdata->u.mgd.tdls_peer, peer)); @@ -1318,6 +1376,11 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, ieee80211_flush_queues(local, sdata, false); ret = sta_info_destroy_addr(sdata, peer); + + mutex_lock(&local->sta_mtx); + iee80211_tdls_recalc_ht_protection(sdata, 
NULL); + mutex_unlock(&local->sta_mtx); + iee80211_tdls_recalc_chanctx(sdata); break; default: @@ -1335,6 +1398,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, &sdata->u.mgd.request_smps_work); mutex_unlock(&local->mtx); + sdata_unlock(sdata); return ret; } From 4c0778933a3d7c35a94e8c35847acd9bb59a257d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Paulo=20Rechi=20Vita?= Date: Tue, 25 Aug 2015 08:56:43 -0400 Subject: [PATCH 12/65] rfkill: Copy "all" global state to other types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When switching the state of all RFKill switches of type all we need to replicate the RFKILL_TYPE_ALL global state to all the other types global state, so it is used to initialize persistent RFKill switches on register. Signed-off-by: João Paulo Rechi Vita Acked-by: Marcel Holtmann Signed-off-by: Johannes Berg --- net/rfkill/core.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/rfkill/core.c b/net/rfkill/core.c index f12149a29cb1..b41e9ea2ffff 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -341,7 +341,15 @@ static void __rfkill_switch_all(const enum rfkill_type type, bool blocked) { struct rfkill *rfkill; - rfkill_global_states[type].cur = blocked; + if (type == RFKILL_TYPE_ALL) { + int i; + + for (i = 0; i < NUM_RFKILL_TYPES; i++) + rfkill_global_states[i].cur = blocked; + } else { + rfkill_global_states[type].cur = blocked; + } + list_for_each_entry(rfkill, &rfkill_list, node) { if (rfkill->type != type && type != RFKILL_TYPE_ALL) continue; From 549cc1c560128d583698ba9a73af283fe87dbab8 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Wed, 2 Sep 2015 19:00:31 +0200 Subject: [PATCH 13/65] cfg80211: regulatory: restore proper user alpha2 restore_regulatory_settings() should restore alpha2 as computed in restore_alpha2(), not raw user_alpha2 to behave as described in the comment just above that code. 
This fixes endless loop of calling CRDA for "00" and "97" countries after resume from suspend on my laptop. Looks like others had the same problem, too: http://ath9k-devel.ath9k.narkive.com/knY5W6St/ath9k-and-crda-messages-in-logs https://bugs.launchpad.net/ubuntu/+source/linux/+bug/899335 https://forum.porteus.org/viewtopic.php?t=4975&p=36436 https://forums.opensuse.org/showthread.php/483356-Authentication-Regulatory-Domain-issues-ath5k-12-2 Signed-off-by: Maciej Szmigiero Signed-off-by: Johannes Berg --- net/wireless/reg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index b144485946f2..2510b231451e 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2625,7 +2625,7 @@ static void restore_regulatory_settings(bool reset_user) * settings, user regulatory settings takes precedence. */ if (is_an_alpha2(alpha2)) - regulatory_hint_user(user_alpha2, NL80211_USER_REG_HINT_USER); + regulatory_hint_user(alpha2, NL80211_USER_REG_HINT_USER); spin_lock(&reg_requests_lock); list_splice_tail_init(&tmp_reg_req_list, &reg_requests_list); From 52a45f38ca5998db0394e782d137595a82a08b43 Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Sat, 15 Aug 2015 22:39:53 +0300 Subject: [PATCH 14/65] mac80211: avoid VHT usage with no 80MHz chans allowed Currently if 80MHz channels are not allowed for use, the VHT IE is not included in the probe request for an AP. This is not good enough if the AP is configured with the wrong regulatory and supports VHT even where prohibited or in TDLS scenarios. Mark the ifmgd with the DISABLE_VHT flag for the misbehaving-AP case, and unset VHT support from the peer-station entry for the TDLS case.
Signed-off-by: Arik Nemtsov Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 16 ++++++++++++++++ net/mac80211/vht.c | 15 +++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 705ef1d040ed..cd7e55e08a23 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4267,6 +4267,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband; struct cfg80211_chan_def chandef; int ret; + u32 i; + bool have_80mhz; sband = local->hw.wiphy->bands[cbss->channel->band]; @@ -4317,6 +4319,20 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } } + /* Allow VHT if at least one channel on the sband supports 80 MHz */ + have_80mhz = false; + for (i = 0; i < sband->n_channels; i++) { + if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | + IEEE80211_CHAN_NO_80MHZ)) + continue; + + have_80mhz = true; + break; + } + + if (!have_80mhz) + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, ht_cap, ht_oper, vht_oper, diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 834ccdbc74be..ff1c798921a6 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -120,6 +120,7 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; struct ieee80211_sta_vht_cap own_cap; u32 cap_info, i; + bool have_80mhz; memset(vht_cap, 0, sizeof(*vht_cap)); @@ -129,6 +130,20 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, if (!vht_cap_ie || !sband->vht_cap.vht_supported) return; + /* Allow VHT if at least one channel on the sband supports 80 MHz */ + have_80mhz = false; + for (i = 0; i < sband->n_channels; i++) { + if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | + IEEE80211_CHAN_NO_80MHZ)) + continue; + + have_80mhz = true; + break; + } + + if 
(!have_80mhz) + return; + /* * A VHT STA must support 40 MHz, but if we verify that here * then we break a few things - some APs (e.g. Netgear R6300v2 From ef9be10c8c999e00b239eec24cf01952a308f8e7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 28 Aug 2015 10:44:20 +0200 Subject: [PATCH 15/65] mac80211: reject software RSSI CQM with beacon filtering When beacon filtering is enabled the mac80211 software implementation for RSSI CQM cannot work as beacons will not be available. Rather than accepting such a configuration without proper effect, reject it. Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 685ec13ed7c2..17b1fe961c5d 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2468,6 +2468,10 @@ static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy, rssi_hyst == bss_conf->cqm_rssi_hyst) return 0; + if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER && + !(sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) + return -EOPNOTSUPP; + bss_conf->cqm_rssi_thold = rssi_thold; bss_conf->cqm_rssi_hyst = rssi_hyst; From 25b4a44c19c83d98e8c0807a7ede07c1f28eab8b Mon Sep 17 00:00:00 2001 From: Richard Laing Date: Thu, 3 Sep 2015 13:52:31 +1200 Subject: [PATCH 16/65] net/ipv6: Correct PIM6 mrt_lock handling In the IPv6 multicast routing code the mrt_lock was not being released correctly in the MFC iterator, as a result adding or deleting a MIF would cause a hang because the mrt_lock could not be acquired. This fix is a copy of the code for the IPv4 case and ensures that the lock is released correctly. Signed-off-by: Richard Laing Acked-by: Cong Wang Signed-off-by: David S. 
Miller --- net/ipv6/ip6mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 74ceb73c1c9a..5f36266b1f5e 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -550,7 +550,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) if (it->cache == &mrt->mfc6_unres_queue) spin_unlock_bh(&mfc_unres_lock); - else if (it->cache == mrt->mfc6_cache_array) + else if (it->cache == &mrt->mfc6_cache_array[it->ct]) read_unlock(&mrt_lock); } From bd516bd1feeb3890502178a74228ec05271f2b6d Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Thu, 3 Sep 2015 11:30:30 +0530 Subject: [PATCH 17/65] net: wan: sbni: fix device usage count dev_get_by_name() will increment the usage count if the matching device is found. But we were not decrementing the count if we have got the device and the device is non-active. Signed-off-by: Sudip Mukherjee Signed-off-by: David S. Miller --- drivers/net/wan/sbni.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wan/sbni.c b/drivers/net/wan/sbni.c index 758c4ba1e97c..8fef8d83436d 100644 --- a/drivers/net/wan/sbni.c +++ b/drivers/net/wan/sbni.c @@ -1358,6 +1358,8 @@ sbni_ioctl( struct net_device *dev, struct ifreq *ifr, int cmd ) if( !slave_dev || !(slave_dev->flags & IFF_UP) ) { netdev_err(dev, "trying to enslave non-active device %s\n", slave_name); + if (slave_dev) + dev_put(slave_dev); return -EPERM; } From b65888796b72b677928527b17eeb8e935b767146 Mon Sep 17 00:00:00 2001 From: Sathya Perla Date: Thu, 3 Sep 2015 07:41:53 -0400 Subject: [PATCH 18/65] be2net: Revert "make the RX_FILTER command asynchronous" commit The be_cmd_rx_filter() routine sends a non-embedded cmd to the FW and used a pre-allocated dma memory to hold the cmd payload. This worked fine when this cmd was synchronous. This cmd was changed to asynchronous mode by the commit 8af65c2f4("make the RX_FILTER command asynchronous"). 
So now when there are two quick invocations of this cmd, the 2nd request may end up overwriting the first request, causing FW cmd corruption. This patch reverts the offending commit and hence fixes the regression. Fixes: 8af65c2f4("be2net: make the RX_FILTER command asynchronous") Signed-off-by: Sathya Perla Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be_cmds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c index 3be1fbdcdd02..eb323913cd39 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.c +++ b/drivers/net/ethernet/emulex/benet/be_cmds.c @@ -1968,7 +1968,7 @@ static int __be_cmd_rx_filter(struct be_adapter *adapter, u32 flags, u32 value) memcpy(req->mcast_mac[i++].byte, ha->addr, ETH_ALEN); } - status = be_mcc_notify(adapter); + status = be_mcc_notify_wait(adapter); err: spin_unlock_bh(&adapter->mcc_lock); return status; From 0890cf6cb6ab1af650025670b1a839671a9a3fcb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Sep 2015 14:04:17 +0200 Subject: [PATCH 19/65] switchdev: fix return value of switchdev_port_fdb_dump in case of error switchdev_port_fdb_dump is used as .ndo_fdb_dump. Its return value is idx, so we cannot return errval. Fixes: 45d4122ca7cd ("switchdev: add support for fdb add/del/dump via switchdev_port_obj ops.") Signed-off-by: Jiri Pirko Acked-by: Sridhar Samudrala Acked-by: Scott Feldman Signed-off-by: David S. 
Miller --- net/switchdev/switchdev.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 16c1c43980a1..fda38f830a10 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -853,12 +853,8 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, .cb = cb, .idx = idx, }; - int err; - - err = switchdev_port_obj_dump(dev, &dump.obj); - if (err) - return err; + switchdev_port_obj_dump(dev, &dump.obj); return dump.idx; } EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump); From 42ea4457aea7aaeddf0c0b06724f297608f5e9d2 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Thu, 3 Sep 2015 21:38:30 +0200 Subject: [PATCH 20/65] net: fec: normalize return value of pm_runtime_get_sync() in MDIO write If fec MDIO write method succeeds its return value comes from call to pm_runtime_get_sync(). But pm_runtime_get_sync() can also return 1. In case of Micrel KSZ9031 PHY this value will then be returned along the call chain of phy_write() -> ksz9031_extended_write() -> ksz9031_center_flp_timing() -> ksz9031_config_init() -> phy_init_hw() -> phy_attach_direct() -> phy_connect_direct(). Then phy_connect() will cast it into a pointer using ERR_PTR(), which then fec_enet_mii_probe() will try to dereference resulting in an oops. Fix it by normalizing return value of pm_runtime_get_sync() to be zero if positive in MDIO write method. Fixes: 8fff755e9f8d ("net: fec: Ensure clocks are enabled while using mdio bus") Signed-off-by: Maciej Szmigiero Acked-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/fec_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 91925e38705e..6cc334035e07 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1816,11 +1816,13 @@ static int fec_enet_mdio_write(struct mii_bus *bus, int mii_id, int regnum, struct fec_enet_private *fep = bus->priv; struct device *dev = &fep->pdev->dev; unsigned long time_left; - int ret = 0; + int ret; ret = pm_runtime_get_sync(dev); if (ret < 0) return ret; + else + ret = 0; fep->mii_timeout = 0; reinit_completion(&fep->mdio_done); From 8f384c0177a03640312b9cb3638c998b32243b63 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 3 Sep 2015 16:24:52 -0400 Subject: [PATCH 21/65] RDS: rds_conn_lookup() should factor in the struct net for a match Only return a conn if the rds_conn_net(conn) matches the struct net passed to rds_conn_lookup(). Fixes: 467fa15356ac ("RDS-TCP: Support multiple RDS-TCP listen endpoints, one per netns.") Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/connection.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/rds/connection.c b/net/rds/connection.c index a50e652eb269..9b2de5e67d79 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -70,7 +70,8 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) } while (0) /* rcu read lock must be held or the connection spinlock */ -static struct rds_connection *rds_conn_lookup(struct hlist_head *head, +static struct rds_connection *rds_conn_lookup(struct net *net, + struct hlist_head *head, __be32 laddr, __be32 faddr, struct rds_transport *trans) { @@ -78,7 +79,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, hlist_for_each_entry_rcu(conn, head, c_hash_node) { if (conn->c_faddr == faddr && conn->c_laddr == laddr && - conn->c_trans == trans) { + conn->c_trans == trans && net == rds_conn_net(conn)) { ret = conn; break; } @@ -132,7 +133,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) goto new_conn; rcu_read_lock(); - conn = rds_conn_lookup(head, laddr, faddr, trans); + conn = rds_conn_lookup(net, head, laddr, faddr, trans); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && laddr == faddr && !is_outgoing) { /* This is a looped back IB connection, and we're @@ -239,7 +240,7 @@ new_conn: if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) found = NULL; else - found = rds_conn_lookup(head, laddr, faddr, trans); + found = rds_conn_lookup(net, head, laddr, faddr, trans); if (found) { trans->conn_free(conn->c_transport_data); kmem_cache_free(rds_conn_slab, conn); From 99c79eceb152e2ac7f8a81ff55d4a810f730ec7b Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 4 Sep 2015 12:47:28 +0200 Subject: [PATCH 22/65] lan78xx: Fix ladv/radv error handling in lan78xx_link_reset() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
net/usb/lan78xx.c: In function ‘lan78xx_link_reset’: net/usb/lan78xx.c:1107: warning: comparison is always false due to limited range of data type net/usb/lan78xx.c:1111: warning: comparison is always false due to limited range of data type Assigning return values that can be negative error codes to "u16" variables makes them positive, ignoring the errors. Hence use "int" instead. Drop the "unlikely"s (unlikely considered harmful) and propagate the actual error values instead of overriding them to -EIO while we're at it. Signed-off-by: Geert Uytterhoeven Signed-off-by: David S. Miller --- drivers/net/usb/lan78xx.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 39364a45af40..a39518fc93aa 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1049,8 +1049,7 @@ static int lan78xx_link_reset(struct lan78xx_net *dev) { struct mii_if_info *mii = &dev->mii; struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET }; - u16 ladv, radv; - int ret; + int ladv, radv, ret; u32 buf; /* clear PHY interrupt status */ @@ -1104,12 +1103,12 @@ static int lan78xx_link_reset(struct lan78xx_net *dev) } ladv = lan78xx_mdio_read(dev->net, mii->phy_id, MII_ADVERTISE); - if (unlikely(ladv < 0)) - return -EIO; + if (ladv < 0) + return ladv; radv = lan78xx_mdio_read(dev->net, mii->phy_id, MII_LPA); - if (unlikely(radv < 0)) - return -EIO; + if (radv < 0) + return radv; netif_dbg(dev, link, dev->net, "speed: %u duplex: %d anadv: 0x%04x anlpa: 0x%04x", From 0f1b7354e0d65ad528b820a8a46c15d70954f699 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 4 Sep 2015 12:49:32 +0200 Subject: [PATCH 23/65] vxlan: Refactor vxlan_udp_encap_recv() to kill compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/net/vxlan.c: In function ‘vxlan_udp_encap_recv’: drivers/net/vxlan.c:1226: warning: ‘info’ may be used uninitialized in this 
function While this warning is a false positive, it can be killed easily by getting rid of the pointer intermediary and referring directly to the ip_tunnel_info structure. Signed-off-by: Geert Uytterhoeven Reviewed-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ce988fd01b34..cf8b7f0473b3 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1223,7 +1223,6 @@ drop: static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct metadata_dst *tun_dst = NULL; - struct ip_tunnel_info *info; struct vxlan_sock *vs; struct vxlanhdr *vxh; u32 flags, vni; @@ -1270,8 +1269,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (!tun_dst) goto drop; - info = &tun_dst->u.tun_info; - md = ip_tunnel_info_opts(info); + md = ip_tunnel_info_opts(&tun_dst->u.tun_info); } else { memset(md, 0, sizeof(*md)); } @@ -1286,7 +1284,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) md->gbp = ntohs(gbp->policy_id); if (tun_dst) - info->key.tun_flags |= TUNNEL_VXLAN_OPT; + tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT; if (gbp->dont_learn) md->gbp |= VXLAN_GBP_DONT_LEARN; From e5a5837da756b4826d40636239718eb8f76775fd Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 4 Sep 2015 14:44:12 +0200 Subject: [PATCH 24/65] ethernet: synopsys: SYNOPSYS_DWC_ETH_QOS should depend on HAS_DMA If NO_DMA=y: ERROR: "dma_alloc_coherent" [drivers/net/ethernet/synopsys/dwc_eth_qos.ko] undefined! ERROR: "dma_free_coherent" [drivers/net/ethernet/synopsys/dwc_eth_qos.ko] undefined! ERROR: "dma_unmap_single" [drivers/net/ethernet/synopsys/dwc_eth_qos.ko] undefined! ERROR: "dma_map_page" [drivers/net/ethernet/synopsys/dwc_eth_qos.ko] undefined! ERROR: "dma_mapping_error" [drivers/net/ethernet/synopsys/dwc_eth_qos.ko] undefined! 
ERROR: "dma_map_single" [drivers/net/ethernet/synopsys/dwc_eth_qos.ko] undefined! Signed-off-by: Geert Uytterhoeven Acked-by: Lars Persson Signed-off-by: David S. Miller --- drivers/net/ethernet/synopsys/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/synopsys/Kconfig b/drivers/net/ethernet/synopsys/Kconfig index a8f315106742..8276ee5a7d54 100644 --- a/drivers/net/ethernet/synopsys/Kconfig +++ b/drivers/net/ethernet/synopsys/Kconfig @@ -20,7 +20,7 @@ config SYNOPSYS_DWC_ETH_QOS select PHYLIB select CRC32 select MII - depends on OF + depends on OF && HAS_DMA ---help--- This driver supports the DWC Ethernet QoS from Synopsys From 585e7e1a544c5b13b2a0014c23f3cb6622e8c995 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 4 Sep 2015 11:22:24 -0400 Subject: [PATCH 25/65] net: dsa: mv88e6171: add hardware 802.1Q support The Marvell 88E6171 switch is in the 88E6351 family, which supports 802.1Q, thus add support from the generic mv88e6xxx functions. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/dsa/mv88e6171.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/dsa/mv88e6171.c b/drivers/net/dsa/mv88e6171.c index d54b7400e8d8..c2daaf087761 100644 --- a/drivers/net/dsa/mv88e6171.c +++ b/drivers/net/dsa/mv88e6171.c @@ -117,6 +117,11 @@ struct dsa_switch_driver mv88e6171_switch_driver = { .port_join_bridge = mv88e6xxx_join_bridge, .port_leave_bridge = mv88e6xxx_leave_bridge, .port_stp_update = mv88e6xxx_port_stp_update, + .port_pvid_get = mv88e6xxx_port_pvid_get, + .port_pvid_set = mv88e6xxx_port_pvid_set, + .port_vlan_add = mv88e6xxx_port_vlan_add, + .port_vlan_del = mv88e6xxx_port_vlan_del, + .vlan_getnext = mv88e6xxx_vlan_getnext, .port_fdb_add = mv88e6xxx_port_fdb_add, .port_fdb_del = mv88e6xxx_port_fdb_del, .port_fdb_getnext = mv88e6xxx_port_fdb_getnext, From f88f69dd17f150e2abcc7e2d95f895f2546fa381 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Fri, 4 Sep 2015 13:07:40 -0700 Subject: [PATCH 26/65] openvswitch: Remove conntrack Kconfig option. There's no particular desire to have conntrack action support in Open vSwitch as an independently configurable bit, rather just to ensure there is not a hard dependency. This exposed option doesn't accurately reflect the conntrack dependency when enabled, so simplify this by removing the option. Compile the support if NF_CONNTRACK is enabled. Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action") Signed-off-by: Joe Stringer Acked-by: Pravin B Shelar Signed-off-by: David S. 
Miller --- net/openvswitch/Kconfig | 12 +----------- net/openvswitch/Makefile | 4 +++- net/openvswitch/conntrack.h | 4 ++-- 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index af7cdef42066..2a071f470d57 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -5,6 +5,7 @@ config OPENVSWITCH tristate "Open vSwitch" depends on INET + depends on (!NF_CONNTRACK || NF_CONNTRACK) select LIBCRC32C select MPLS select NET_MPLS_GSO @@ -31,17 +32,6 @@ config OPENVSWITCH If unsure, say N. -config OPENVSWITCH_CONNTRACK - bool "Open vSwitch conntrack action support" - depends on OPENVSWITCH - depends on NF_CONNTRACK - default OPENVSWITCH - ---help--- - If you say Y here, then Open vSwitch module will be able to pass - packets through conntrack. - - Say N to exclude this support and reduce the binary size. - config OPENVSWITCH_GRE tristate "Open vSwitch GRE tunneling support" depends on OPENVSWITCH diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 5b5913b06f54..60f809085b92 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -15,7 +15,9 @@ openvswitch-y := \ vport-internal_dev.o \ vport-netdev.o -openvswitch-$(CONFIG_OPENVSWITCH_CONNTRACK) += conntrack.o +ifneq ($(CONFIG_NF_CONNTRACK),) +openvswitch-y += conntrack.o +endif obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index 3cb30667a7dc..43f5dd7a5577 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -19,7 +19,7 @@ struct ovs_conntrack_info; enum ovs_key_attr; -#if defined(CONFIG_OPENVSWITCH_CONNTRACK) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) void ovs_ct_init(struct net *); void ovs_ct_exit(struct net *); bool ovs_ct_verify(struct net *, enum ovs_key_attr attr); @@ -82,5 +82,5 @@ static inline int ovs_ct_put_key(const struct sw_flow_key *key, } static inline void 
ovs_ct_free_action(const struct nlattr *a) { } -#endif +#endif /* CONFIG_NF_CONNTRACK */ #endif /* ovs_conntrack.h */ From bd1a05ee98b06c9a20138c45f96ccfddf3163f93 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Thu, 3 Sep 2015 23:22:16 +0300 Subject: [PATCH 27/65] fixed_phy: pass 'irq' to fixed_phy_add() I've noticed that fixed_phy_register() ignores its 'irq' parameter instead of passing it to fixed_phy_add(). Luckily, fixed_phy_register() seems to always be called with PHY_POLL for 'irq'... :-) Fixes: a75951217472 ("net: phy: extend fixed driver with fixed_phy_register()") Signed-off-by: Sergei Shtylyov Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/fixed_phy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c index 12c7eb2c604e..fb1299c6326e 100644 --- a/drivers/net/phy/fixed_phy.c +++ b/drivers/net/phy/fixed_phy.c @@ -325,7 +325,7 @@ struct phy_device *fixed_phy_register(unsigned int irq, phy_addr = phy_fixed_addr++; spin_unlock(&phy_fixed_addr_lock); - ret = fixed_phy_add(PHY_POLL, phy_addr, status, link_gpio); + ret = fixed_phy_add(irq, phy_addr, status, link_gpio); if (ret < 0) return ERR_PTR(ret); From 46cdc9be0841b30ba612aec1878cb746faf280a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?fran=C3=A7ois=20romieu?= Date: Fri, 4 Sep 2015 23:05:42 +0200 Subject: [PATCH 28/65] cxgb4: fix usage of uninitialized variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c: In function ‘init_one’: drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c:4579:8: warning: ‘chip’ may be used uninitialized in this function [-Wmaybe-uninitialized] chip |= CHELSIO_CHIP_CODE(CHELSIO_T4, pl_rev); ^ drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c:4571:11: note: ‘chip’ was declared here int ver, chip; ^ Fixes: d86bd29e0b31 ("cxgb4/cxgb4vf: read the correct bits of PL Who Am I register") 
Signed-off-by: Francois Romieu Cc: Hariprasad Shenai Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index eb22d58743e2..592a4d66169c 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -4568,28 +4568,23 @@ static void free_some_resources(struct adapter *adapter) static int get_chip_type(struct pci_dev *pdev, u32 pl_rev) { - int ver, chip; u16 device_id; /* Retrieve adapter's device ID */ pci_read_config_word(pdev, PCI_DEVICE_ID, &device_id); - ver = device_id >> 12; - switch (ver) { + + switch (device_id >> 12) { case CHELSIO_T4: - chip |= CHELSIO_CHIP_CODE(CHELSIO_T4, pl_rev); - break; + return CHELSIO_CHIP_CODE(CHELSIO_T4, pl_rev); case CHELSIO_T5: - chip |= CHELSIO_CHIP_CODE(CHELSIO_T5, pl_rev); - break; + return CHELSIO_CHIP_CODE(CHELSIO_T5, pl_rev); case CHELSIO_T6: - chip |= CHELSIO_CHIP_CODE(CHELSIO_T6, pl_rev); - break; + return CHELSIO_CHIP_CODE(CHELSIO_T6, pl_rev); default: dev_err(&pdev->dev, "Device %d is not supported\n", device_id); - return -EINVAL; } - return chip; + return -EINVAL; } static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) From fcb0bb6aab256288a4e0a8650d26e4096ec30319 Mon Sep 17 00:00:00 2001 From: Eugene Shatokhin Date: Tue, 1 Sep 2015 17:05:33 +0300 Subject: [PATCH 29/65] usbnet: Fix a race between usbnet_stop() and the BH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The race may happen when a device (e.g. YOTA 4G LTE Modem) is unplugged while the system is downloading a large file from the Net. Hardware breakpoints and Kprobes with delays were used to confirm that the race does actually happen. 
The race is on skb_queue ('next' pointer) between usbnet_stop() and rx_complete(), which, in turn, calls usbnet_bh(). Here is a part of the call stack with the code where the changes to the queue happen. The line numbers are for the kernel 4.1.0: *0 __skb_unlink (skbuff.h:1517) prev->next = next; *1 defer_bh (usbnet.c:430) spin_lock_irqsave(&list->lock, flags); old_state = entry->state; entry->state = state; __skb_unlink(skb, list); spin_unlock(&list->lock); spin_lock(&dev->done.lock); __skb_queue_tail(&dev->done, skb); if (dev->done.qlen == 1) tasklet_schedule(&dev->bh); spin_unlock_irqrestore(&dev->done.lock, flags); *2 rx_complete (usbnet.c:640) state = defer_bh(dev, skb, &dev->rxq, state); At the same time, the following code repeatedly checks if the queue is empty and reads these values concurrently with the above changes: *0 usbnet_terminate_urbs (usbnet.c:765) /* maybe wait for deletions to finish. */ while (!skb_queue_empty(&dev->rxq) && !skb_queue_empty(&dev->txq) && !skb_queue_empty(&dev->done)) { schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); set_current_state(TASK_UNINTERRUPTIBLE); netif_dbg(dev, ifdown, dev->net, "waited for %d urb completions\n", temp); } *1 usbnet_stop (usbnet.c:806) if (!(info->flags & FLAG_AVOID_UNLINK_URBS)) usbnet_terminate_urbs(dev); As a result, it is possible, for example, that the skb is removed from dev->rxq by __skb_unlink() before the check "!skb_queue_empty(&dev->rxq)" in usbnet_terminate_urbs() is made. It is also possible in this case that the skb is added to dev->done queue after "!skb_queue_empty(&dev->done)" is checked. So usbnet_terminate_urbs() may stop waiting and return while dev->done queue still has an item. Locking in defer_bh() and usbnet_terminate_urbs() was revisited to avoid this race. Signed-off-by: Eugene Shatokhin Reviewed-by: Bjørn Mork Acked-by: Oliver Neukum Signed-off-by: David S. 
Miller --- drivers/net/usb/usbnet.c | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index e0498571ae26..b4cf10781348 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -428,12 +428,18 @@ static enum skb_state defer_bh(struct usbnet *dev, struct sk_buff *skb, old_state = entry->state; entry->state = state; __skb_unlink(skb, list); - spin_unlock(&list->lock); - spin_lock(&dev->done.lock); + + /* defer_bh() is never called with list == &dev->done. + * spin_lock_nested() tells lockdep that it is OK to take + * dev->done.lock here with list->lock held. + */ + spin_lock_nested(&dev->done.lock, SINGLE_DEPTH_NESTING); + __skb_queue_tail(&dev->done, skb); if (dev->done.qlen == 1) tasklet_schedule(&dev->bh); - spin_unlock_irqrestore(&dev->done.lock, flags); + spin_unlock(&dev->done.lock); + spin_unlock_irqrestore(&list->lock, flags); return old_state; } @@ -749,6 +755,20 @@ EXPORT_SYMBOL_GPL(usbnet_unlink_rx_urbs); /*-------------------------------------------------------------------------*/ +static void wait_skb_queue_empty(struct sk_buff_head *q) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + while (!skb_queue_empty(q)) { + spin_unlock_irqrestore(&q->lock, flags); + schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock_irqsave(&q->lock, flags); + } + spin_unlock_irqrestore(&q->lock, flags); +} + // precondition: never called in_interrupt static void usbnet_terminate_urbs(struct usbnet *dev) { @@ -762,14 +782,11 @@ static void usbnet_terminate_urbs(struct usbnet *dev) unlink_urbs(dev, &dev->rxq); /* maybe wait for deletions to finish. 
*/ - while (!skb_queue_empty(&dev->rxq) - && !skb_queue_empty(&dev->txq) - && !skb_queue_empty(&dev->done)) { - schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); - set_current_state(TASK_UNINTERRUPTIBLE); - netif_dbg(dev, ifdown, dev->net, - "waited for %d urb completions\n", temp); - } + wait_skb_queue_empty(&dev->rxq); + wait_skb_queue_empty(&dev->txq); + wait_skb_queue_empty(&dev->done); + netif_dbg(dev, ifdown, dev->net, + "waited for %d urb completions\n", temp); set_current_state(TASK_RUNNING); remove_wait_queue(&dev->wait, &wait); } From 5b902d6f97f573fde911338e5d943e6b07fac7f9 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Thu, 3 Sep 2015 23:59:50 +0100 Subject: [PATCH 30/65] device property: Don't overwrite addr when failing in device_get_mac_address The function device_get_mac_address is trying different property names in order to get the mac address. To check the return value, the variable addr (which contains the buffer passed by the caller) will be re-used. This means that if the previous property is not found, the next property will be read using a NULL buffer. Therefore it's only possible to retrieve the mac if node contains a property "mac-address". Fix it by using a temporary buffer for the return value. This has been introduced by commit 4c96b7dc0d393f12c17e0d81db15aa4a820a6ab3 "Add a matching set of device_ functions for determining mac/phy" Signed-off-by: Julien Grall Cc: Jeremy Linton Cc: David S. Miller Reviewed-by: Jeremy Linton Signed-off-by: David S. 
Miller --- drivers/base/property.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/base/property.c b/drivers/base/property.c index ff03f2348f77..2d75366c61e0 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -611,13 +611,15 @@ static void *device_get_mac_addr(struct device *dev, */ void *device_get_mac_address(struct device *dev, char *addr, int alen) { - addr = device_get_mac_addr(dev, "mac-address", addr, alen); - if (addr) - return addr; + char *res; - addr = device_get_mac_addr(dev, "local-mac-address", addr, alen); - if (addr) - return addr; + res = device_get_mac_addr(dev, "mac-address", addr, alen); + if (res) + return res; + + res = device_get_mac_addr(dev, "local-mac-address", addr, alen); + if (res) + return res; return device_get_mac_addr(dev, "address", addr, alen); } From 39797a279d62972cd914ef580fdfacb13e508bf8 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sat, 5 Sep 2015 13:07:27 -0700 Subject: [PATCH 31/65] net: dsa: bcm_sf2: Fix ageing conditions and operation The comparison check between cur_hw_state and hw_state is currently invalid because cur_hw_state is right shifted by G_MISTP_SHIFT, while hw_state is not, so we end up comparing bits 2:0 with bits 7:5, which is going to cause an additional aging to occur. Fix this by not shifting cur_hw_state while reading it, but instead, mask the value with the appropriately shifted bitmask. The other problem with the fast-ageing process is that we did not set the EN_AGE_DYNAMIC bit to request the ageing to occur for dynamically learned MAC addresses. Finally, write back 0 to the FAST_AGE_CTRL register to avoid leaving spurious bits set from one operation to the other. Fixes: 12f460f23423 ("net: dsa: bcm_sf2: add HW bridging support") Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/bcm_sf2.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 289e20443d83..9d56515f4c4d 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -418,7 +418,7 @@ static int bcm_sf2_sw_fast_age_port(struct dsa_switch *ds, int port) core_writel(priv, port, CORE_FAST_AGE_PORT); reg = core_readl(priv, CORE_FAST_AGE_CTRL); - reg |= EN_AGE_PORT | FAST_AGE_STR_DONE; + reg |= EN_AGE_PORT | EN_AGE_DYNAMIC | FAST_AGE_STR_DONE; core_writel(priv, reg, CORE_FAST_AGE_CTRL); do { @@ -432,6 +432,8 @@ static int bcm_sf2_sw_fast_age_port(struct dsa_switch *ds, int port) if (!timeout) return -ETIMEDOUT; + core_writel(priv, 0, CORE_FAST_AGE_CTRL); + return 0; } @@ -507,7 +509,7 @@ static int bcm_sf2_sw_br_set_stp_state(struct dsa_switch *ds, int port, u32 reg; reg = core_readl(priv, CORE_G_PCTL_PORT(port)); - cur_hw_state = reg >> G_MISTP_STATE_SHIFT; + cur_hw_state = reg & (G_MISTP_STATE_MASK << G_MISTP_STATE_SHIFT); switch (state) { case BR_STATE_DISABLED: @@ -531,10 +533,12 @@ static int bcm_sf2_sw_br_set_stp_state(struct dsa_switch *ds, int port, } /* Fast-age ARL entries if we are moving a port from Learning or - * Forwarding state to Disabled, Blocking or Listening state + * Forwarding (cur_hw_state) state to Disabled, Blocking or Listening + * state (hw_state) */ if (cur_hw_state != hw_state) { - if (cur_hw_state & 4 && !(hw_state & 4)) { + if (cur_hw_state >= G_MISTP_LEARN_STATE && + hw_state <= G_MISTP_LISTEN_STATE) { ret = bcm_sf2_sw_fast_age_port(ds, port); if (ret) { pr_err("%s: fast-ageing failed\n", __func__); From bf361ad38165939049a2649b1a0078f3268d4bd1 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Sat, 5 Sep 2015 21:27:57 -0400 Subject: [PATCH 32/65] net: bridge: check __vlan_vid_del for error Since __vlan_del can return an error code, change its inner function __vlan_vid_del to return an eventual error from switchdev_port_obj_del. 
Signed-off-by: Vivien Didelot Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/bridge/br_vlan.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 3cd8cc9e804b..5f5a02b49a99 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -117,10 +117,11 @@ out_filt: return err; } -static void __vlan_vid_del(struct net_device *dev, struct net_bridge *br, - u16 vid) +static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, + u16 vid) { const struct net_device_ops *ops = dev->netdev_ops; + int err = 0; /* If driver uses VLAN ndo ops, use 8021q to delete vid * on device, otherwise try switchdev ops to delete vid. @@ -137,8 +138,12 @@ static void __vlan_vid_del(struct net_device *dev, struct net_bridge *br, }, }; - switchdev_port_obj_del(dev, &vlan_obj); + err = switchdev_port_obj_del(dev, &vlan_obj); + if (err == -EOPNOTSUPP) + err = 0; } + + return err; } static int __vlan_del(struct net_port_vlans *v, u16 vid) @@ -151,7 +156,11 @@ static int __vlan_del(struct net_port_vlans *v, u16 vid) if (v->port_idx) { struct net_bridge_port *p = v->parent.port; - __vlan_vid_del(p->dev, p->br, vid); + int err; + + err = __vlan_vid_del(p->dev, p->br, vid); + if (err) + return err; } clear_bit(vid, v->vlan_bitmap); From 7a577f013d6745c800a11a2911ddc9a3214e7f09 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Sat, 5 Sep 2015 21:49:41 -0400 Subject: [PATCH 33/65] net: bridge: remove unnecessary switchdev include Remove the unnecessary switchdev.h include from br_netlink.c. Signed-off-by: Vivien Didelot Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/bridge/br_netlink.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index af5e187553fd..ea748c93a07f 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include "br_private.h" From 7b9018738e43c7c7693632174c69fde83b8edc07 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 7 Sep 2015 03:15:20 +0000 Subject: [PATCH 34/65] dm9000: fix a typo Signed-off-by: Barry Song Signed-off-by: David S. Miller --- drivers/net/ethernet/davicom/dm9000.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c index c0a7813603c3..cf94b72dbacd 100644 --- a/drivers/net/ethernet/davicom/dm9000.c +++ b/drivers/net/ethernet/davicom/dm9000.c @@ -1226,7 +1226,7 @@ static irqreturn_t dm9000_interrupt(int irq, void *dev_id) if (int_status & ISR_PRS) dm9000_rx(dev); - /* Trnasmit Interrupt check */ + /* Transmit Interrupt check */ if (int_status & ISR_PTS) dm9000_tx_done(dev, db); From 7845989cb4b3da1db903918c844fccb9817d34a0 Mon Sep 17 00:00:00 2001 From: Kolmakov Dmitriy Date: Mon, 7 Sep 2015 09:05:48 +0000 Subject: [PATCH 35/65] net: tipc: fix stall during bclink wakeup procedure If an attempt to wake up users of the broadcast link is made when there is not enough space in the send queue, it may hang inside the tipc_sk_rcv() function, since the loop breaks only after the wakeup queue becomes empty.
This can lead to complete CPU stall with the following message generated by RCU: INFO: rcu_sched self-detected stall on CPU { 0} (t=2101 jiffies g=54225 c=54224 q=11465) Task dump for CPU 0: tpch R running task 0 39949 39948 0x0000000a ffffffff818536c0 ffff88181fa037a0 ffffffff8106a4be 0000000000000000 ffffffff818536c0 ffff88181fa037c0 ffffffff8106d8a8 ffff88181fa03800 0000000000000001 ffff88181fa037f0 ffffffff81094a50 ffff88181fa15680 Call Trace: [] sched_show_task+0xae/0x120 [] dump_cpu_task+0x38/0x40 [] rcu_dump_cpu_stacks+0x90/0xd0 [] rcu_check_callbacks+0x3eb/0x6e0 [] ? account_system_time+0x7f/0x170 [] update_process_times+0x34/0x60 [] tick_sched_handle.isra.18+0x31/0x40 [] tick_sched_timer+0x3c/0x70 [] __run_hrtimer.isra.34+0x3d/0xc0 [] hrtimer_interrupt+0xc5/0x1e0 [] ? native_smp_send_reschedule+0x42/0x60 [] local_apic_timer_interrupt+0x34/0x60 [] smp_apic_timer_interrupt+0x3c/0x60 [] apic_timer_interrupt+0x6b/0x70 [] ? _raw_spin_unlock_irqrestore+0x9/0x10 [] __wake_up_sync_key+0x4f/0x60 [] tipc_write_space+0x31/0x40 [tipc] [] filter_rcv+0x31f/0x520 [tipc] [] ? tipc_sk_lookup+0xc9/0x110 [tipc] [] ? _raw_spin_lock_bh+0x19/0x30 [] tipc_sk_rcv+0x2dc/0x3e0 [tipc] [] tipc_bclink_wakeup_users+0x2f/0x40 [tipc] [] tipc_node_unlock+0x186/0x190 [tipc] [] ? kfree_skb+0x2c/0x40 [] tipc_rcv+0x2ac/0x8c0 [tipc] [] tipc_l2_rcv_msg+0x38/0x50 [tipc] [] __netif_receive_skb_core+0x5a3/0x950 [] __netif_receive_skb+0x13/0x60 [] netif_receive_skb_internal+0x1e/0x90 [] napi_gro_receive+0x78/0xa0 [] tg3_poll_work+0xc54/0xf40 [tg3] [] ? consume_skb+0x2c/0x40 [] tg3_poll_msix+0x41/0x160 [tg3] [] net_rx_action+0xe2/0x290 [] __do_softirq+0xda/0x1f0 [] irq_exit+0x76/0xa0 [] do_IRQ+0x55/0xf0 [] common_interrupt+0x6b/0x6b The issue occurs only when tipc_sk_rcv() is used to wake up postponed senders: tipc_bclink_wakeup_users() // wakeupq - is a queue which consists of special // messages with SOCK_WAKEUP type. tipc_sk_rcv(wakeupq) ... 
while (skb_queue_len(inputq)) { filter_rcv(skb) // Here the type of message is checked // and if it is SOCK_WAKEUP then // it tries to wake up a sender. tipc_write_space(sk) wake_up_interruptible_sync_poll() } After the sender thread is woken up it can gain control and attempt to send a message. But if there is not enough space in the send queue it will call the link_schedule_user() function, which puts a message of type SOCK_WAKEUP on the wakeup queue and puts the sender to sleep. Thus the size of the queue actually is not changed and the while() loop never exits. The approach I proposed is to wake up only senders for which there is enough space in the send queue so the described issue can't occur. Moreover the same approach is already used to wake up senders on unicast links. I ran into the issue in our product code but to reproduce the issue I changed a benchmark test application (from tipcutils/demos/benchmark) to perform the following scenario: 1. Run 64 instances of test application (nodes). It can be done on one physical machine. 2. Each application connects to all others using TIPC sockets in RDM mode. 3. When setup is done all nodes simultaneously start sending broadcast messages. 4. Everything hangs up. The issue is reproducible only when congestion on the broadcast link occurs. For example, when there are only 8 nodes it works fine since congestion doesn't occur. Send queue limit is 40 in my case (I use a critical importance level) and when 64 nodes send a message at the same moment congestion occurs every time. Signed-off-by: Dmitry S Kolmakov Reviewed-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S.
Miller --- net/tipc/bcast.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 8b010c976b2f..41042de3ae9b 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -169,6 +169,30 @@ static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to) } } +/** + * bclink_prepare_wakeup - prepare users for wakeup after congestion + * @bcl: broadcast link + * @resultq: queue for users which can be woken up + * Move a number of waiting users, as permitted by available space in + * the send queue, from link wait queue to specified queue for wakeup + */ +static void bclink_prepare_wakeup(struct tipc_link *bcl, struct sk_buff_head *resultq) +{ + int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,}; + int imp, lim; + struct sk_buff *skb, *tmp; + + skb_queue_walk_safe(&bcl->wakeupq, skb, tmp) { + imp = TIPC_SKB_CB(skb)->chain_imp; + lim = bcl->window + bcl->backlog[imp].limit; + pnd[imp] += TIPC_SKB_CB(skb)->chain_sz; + if ((pnd[imp] + bcl->backlog[imp].len) >= lim) + continue; + skb_unlink(skb, &bcl->wakeupq); + skb_queue_tail(resultq, skb); + } +} + /** * tipc_bclink_wakeup_users - wake up pending users * @@ -177,8 +201,12 @@ static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to) void tipc_bclink_wakeup_users(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_link *bcl = tn->bcl; + struct sk_buff_head resultq; - tipc_sk_rcv(net, &tn->bclink->link.wakeupq); + skb_queue_head_init(&resultq); + bclink_prepare_wakeup(bcl, &resultq); + tipc_sk_rcv(net, &resultq); } /** From d43cefcd68bbc9a67b2c0efe38eb9cf6b5170fe8 Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Tue, 8 Sep 2015 18:15:41 +0900 Subject: [PATCH 36/65] net: eth: altera: Fix the initial device operstate Call netif_carrier_off() prior to register_netdev(), otherwise userspace can see incorrect link state. Signed-off-by: Atsushi Nemoto Signed-off-by: David S. 
Miller --- drivers/net/ethernet/altera/altera_tse_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/altera/altera_tse_main.c b/drivers/net/ethernet/altera/altera_tse_main.c index 8207877d6237..fe644823ceaf 100644 --- a/drivers/net/ethernet/altera/altera_tse_main.c +++ b/drivers/net/ethernet/altera/altera_tse_main.c @@ -1517,6 +1517,7 @@ static int altera_tse_probe(struct platform_device *pdev) spin_lock_init(&priv->tx_lock); spin_lock_init(&priv->rxdma_irq_lock); + netif_carrier_off(ndev); ret = register_netdev(ndev); if (ret) { dev_err(&pdev->dev, "failed to register TSE net device\n"); From fd1754fb8afbd9cf4ea279d533414aa6577b7e60 Mon Sep 17 00:00:00 2001 From: Hariprasad Shenai Date: Tue, 8 Sep 2015 16:25:39 +0530 Subject: [PATCH 37/65] cxgb4: Fix tx flit calculation In commit 0aac3f56d4a63f04 ("cxgb4: Add comment for calculate tx flits and sge length code") introduced a regression where tx flit calculation is going wrong, which can lead to data corruption, hang, stall and write-combining failure. Fixing it. Signed-off-by: Hariprasad Shenai Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/sge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index 78f446c58422..9162746d7729 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -807,7 +807,7 @@ static inline unsigned int calc_tx_flits(const struct sk_buff *skb) * message or, if we're doing a Large Send Offload, an LSO CPL message * with an embedded TX Packet Write CPL message. 
*/ - flits = sgl_len(skb_shinfo(skb)->nr_frags + 1) + 4; + flits = sgl_len(skb_shinfo(skb)->nr_frags + 1); if (skb_shinfo(skb)->gso_size) flits += (sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_lso_core) + From 2a485cf7db2815ca0d1510143d9126c4475aab39 Mon Sep 17 00:00:00 2001 From: Hariprasad Shenai Date: Tue, 8 Sep 2015 16:25:40 +0530 Subject: [PATCH 38/65] cxgb4: Fix for write-combining stats configuration The write-combining configuration register SGE_STAT_CFG_A needs to be configured after FW initializes the adapter, else FW will reset the configuration Signed-off-by: Hariprasad Shenai Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 592a4d66169c..f5dcde27e402 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -4719,8 +4719,6 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) err = -ENOMEM; goto out_free_adapter; } - t4_write_reg(adapter, SGE_STAT_CFG_A, - STATSOURCE_T5_V(7) | STATMODE_V(0)); } setup_memwin(adapter); @@ -4732,6 +4730,11 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) goto out_unmap_bar; + /* configure SGE_STAT_CFG_A to read WC stats */ + if (!is_t4(adapter->params.chip)) + t4_write_reg(adapter, SGE_STAT_CFG_A, + STATSOURCE_T5_V(7) | STATMODE_V(0)); + for_each_port(adapter, i) { struct net_device *netdev; From 1d5d48523900a4b0f25d6b52f1a93c84bd671186 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Tue, 8 Sep 2015 14:25:14 +0100 Subject: [PATCH 39/65] xen-netback: require fewer guest Rx slots when not using GSO Commit f48da8b14d04ca87ffcffe68829afd45f926ec6a (xen-netback: fix unlimited guest Rx internal queue and carrier flapping) introduced a regression. 
The PV frontend in IPXE only places 4 requests on the guest Rx ring. Since netback required at least (MAX_SKB_FRAGS + 1) slots, IPXE could not receive any packets. a) If GSO is not enabled on the VIF, fewer guest Rx slots are required for the largest possible packet. Calculate the required slots based on the maximum GSO size or the MTU. This calculation of the number of required slots relies on 1650d5455bd2 (xen-netback: always fully coalesce guest Rx packets) which is present in 4.0-rc1 and later. b) Reduce the Rx stall detection to checking for at least one available Rx request. This is fine since we're predominately concerned with detecting interfaces which are down and thus have zero available Rx requests. Signed-off-by: David Vrabel Reviewed-by: Wei Liu Signed-off-by: David S. Miller --- drivers/net/xen-netback/common.h | 10 ---------- drivers/net/xen-netback/netback.c | 23 ++++++++++++++++------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index 6dc76c1e807b..a7bf74727116 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -200,11 +200,6 @@ struct xenvif_queue { /* Per-queue data for xenvif */ struct xenvif_stats stats; }; -/* Maximum number of Rx slots a to-guest packet may use, including the - * slot needed for GSO meta-data. - */ -#define XEN_NETBK_RX_SLOTS_MAX (MAX_SKB_FRAGS + 1) - enum state_bit_shift { /* This bit marks that the vif is connected */ VIF_STATUS_CONNECTED, @@ -317,11 +312,6 @@ int xenvif_dealloc_kthread(void *data); void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb); -/* Determine whether the needed number of slots (req) are available, - * and set req_event if not.
- */ -bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue, int needed); - void xenvif_carrier_on(struct xenvif *vif); /* Callback from stack when TX packet can be released */ diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 42569b994ea8..b588b1a08cd4 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -149,9 +149,20 @@ static inline pending_ring_idx_t pending_index(unsigned i) return i & (MAX_PENDING_REQS-1); } -bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue, int needed) +static int xenvif_rx_ring_slots_needed(struct xenvif *vif) +{ + if (vif->gso_mask) + return DIV_ROUND_UP(vif->dev->gso_max_size, PAGE_SIZE) + 1; + else + return DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE); +} + +static bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue) { RING_IDX prod, cons; + int needed; + + needed = xenvif_rx_ring_slots_needed(queue->vif); do { prod = queue->rx.sring->req_prod; @@ -513,7 +524,7 @@ static void xenvif_rx_action(struct xenvif_queue *queue) skb_queue_head_init(&rxq); - while (xenvif_rx_ring_slots_available(queue, XEN_NETBK_RX_SLOTS_MAX) + while (xenvif_rx_ring_slots_available(queue) && (skb = xenvif_rx_dequeue(queue)) != NULL) { queue->last_rx_time = jiffies; @@ -1938,8 +1949,7 @@ static bool xenvif_rx_queue_stalled(struct xenvif_queue *queue) prod = queue->rx.sring->req_prod; cons = queue->rx.req_cons; - return !queue->stalled - && prod - cons < XEN_NETBK_RX_SLOTS_MAX + return !queue->stalled && prod - cons < 1 && time_after(jiffies, queue->last_rx_time + queue->vif->stall_timeout); } @@ -1951,14 +1961,13 @@ static bool xenvif_rx_queue_ready(struct xenvif_queue *queue) prod = queue->rx.sring->req_prod; cons = queue->rx.req_cons; - return queue->stalled - && prod - cons >= XEN_NETBK_RX_SLOTS_MAX; + return queue->stalled && prod - cons >= 1; } static bool xenvif_have_rx_work(struct xenvif_queue *queue) { return (!skb_queue_empty(&queue->rx_queue) - && 
xenvif_rx_ring_slots_available(queue, XEN_NETBK_RX_SLOTS_MAX)) + && xenvif_rx_ring_slots_available(queue)) || (queue->vif->stall_timeout && (xenvif_rx_queue_stalled(queue) || xenvif_rx_queue_ready(queue))) From 74e98eb085889b0d2d4908f59f6e00026063014f Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 8 Sep 2015 10:53:40 -0400 Subject: [PATCH 40/65] RDS: verify the underlying transport exists before creating a connection There was no verification that an underlying transport exists when creating a connection, this would cause dereferencing a NULL ptr. It might happen on sockets that weren't properly bound before attempting to send a message, which will cause a NULL ptr deref: [135546.047719] kasan: GPF could be caused by NULL-ptr deref or user memory accessgeneral protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC KASAN [135546.051270] Modules linked in: [135546.051781] CPU: 4 PID: 15650 Comm: trinity-c4 Not tainted 4.2.0-next-20150902-sasha-00041-gbaa1222-dirty #2527 [135546.053217] task: ffff8800835bc000 ti: ffff8800bc708000 task.ti: ffff8800bc708000 [135546.054291] RIP: __rds_conn_create (net/rds/connection.c:194) [135546.055666] RSP: 0018:ffff8800bc70fab0 EFLAGS: 00010202 [135546.056457] RAX: dffffc0000000000 RBX: 0000000000000f2c RCX: ffff8800835bc000 [135546.057494] RDX: 0000000000000007 RSI: ffff8800835bccd8 RDI: 0000000000000038 [135546.058530] RBP: ffff8800bc70fb18 R08: 0000000000000001 R09: 0000000000000000 [135546.059556] R10: ffffed014d7a3a23 R11: ffffed014d7a3a21 R12: 0000000000000000 [135546.060614] R13: 0000000000000001 R14: ffff8801ec3d0000 R15: 0000000000000000 [135546.061668] FS: 00007faad4ffb700(0000) GS:ffff880252000000(0000) knlGS:0000000000000000 [135546.062836] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [135546.063682] CR2: 000000000000846a CR3: 000000009d137000 CR4: 00000000000006a0 [135546.064723] Stack: [135546.065048] ffffffffafe2055c ffffffffafe23fc1 ffffed00493097bf ffff8801ec3d0008 [135546.066247] 0000000000000000 
00000000000000d0 0000000000000000 ac194a24c0586342 [135546.067438] 1ffff100178e1f78 ffff880320581b00 ffff8800bc70fdd0 ffff880320581b00 [135546.068629] Call Trace: [135546.069028] ? __rds_conn_create (include/linux/rcupdate.h:856 net/rds/connection.c:134) [135546.069989] ? rds_message_copy_from_user (net/rds/message.c:298) [135546.071021] rds_conn_create_outgoing (net/rds/connection.c:278) [135546.071981] rds_sendmsg (net/rds/send.c:1058) [135546.072858] ? perf_trace_lock (include/trace/events/lock.h:38) [135546.073744] ? lockdep_init (kernel/locking/lockdep.c:3298) [135546.074577] ? rds_send_drop_to (net/rds/send.c:976) [135546.075508] ? __might_fault (./arch/x86/include/asm/current.h:14 mm/memory.c:3795) [135546.076349] ? __might_fault (mm/memory.c:3795) [135546.077179] ? rds_send_drop_to (net/rds/send.c:976) [135546.078114] sock_sendmsg (net/socket.c:611 net/socket.c:620) [135546.078856] SYSC_sendto (net/socket.c:1657) [135546.079596] ? SYSC_connect (net/socket.c:1628) [135546.080510] ? trace_dump_stack (kernel/trace/trace.c:1926) [135546.081397] ? ring_buffer_unlock_commit (kernel/trace/ring_buffer.c:2479 kernel/trace/ring_buffer.c:2558 kernel/trace/ring_buffer.c:2674) [135546.082390] ? trace_buffer_unlock_commit (kernel/trace/trace.c:1749) [135546.083410] ? trace_event_raw_event_sys_enter (include/trace/events/syscalls.h:16) [135546.084481] ? do_audit_syscall_entry (include/trace/events/syscalls.h:16) [135546.085438] ? trace_buffer_unlock_commit (kernel/trace/trace.c:1749) [135546.085515] rds_ib_laddr_check(): addr 36.74.25.172 ret -99 node type -1 Acked-by: Santosh Shilimkar Signed-off-by: Sasha Levin Signed-off-by: David S. 
Miller --- net/rds/connection.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/rds/connection.c b/net/rds/connection.c index 9b2de5e67d79..49adeef8090c 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -190,6 +190,12 @@ new_conn: } } + if (trans == NULL) { + kmem_cache_free(rds_conn_slab, conn); + conn = ERR_PTR(-ENODEV); + goto out; + } + conn->c_trans = trans; ret = trans->conn_alloc(conn, gfp); From 592867bfabe2fcb449393ba7eb0de4f972a08c63 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 8 Sep 2015 18:00:09 +0200 Subject: [PATCH 41/65] ebpf: fix fd refcount leaks related to maps in bpf syscall We may already have gotten a proper fd struct through fdget(), so whenever we return at the end of a map operation, we need to call fdput(). However, each map operation from syscall side first probes CHECK_ATTR() to verify that unused fields in the bpf_attr union are zero. In case of malformed input, we return with error, but the lookup to the map_fd was already performed at that time, so that we return without a corresponding fdput(). Fix it by performing an fdget() only right before bpf_map_get(). The fdget() invocation on maps in the verifier is not affected. Fixes: db20fd2b0108 ("bpf: add lookup/update/delete/iterate methods to BPF maps") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- kernel/bpf/syscall.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dc9b464fefa9..35bac8e8b071 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -155,14 +155,15 @@ static int map_lookup_elem(union bpf_attr *attr) void __user *ukey = u64_to_ptr(attr->key); void __user *uvalue = u64_to_ptr(attr->value); int ufd = attr->map_fd; - struct fd f = fdget(ufd); struct bpf_map *map; void *key, *value, *ptr; + struct fd f; int err; if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) return -EINVAL; + f = fdget(ufd); map = bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -213,14 +214,15 @@ static int map_update_elem(union bpf_attr *attr) void __user *ukey = u64_to_ptr(attr->key); void __user *uvalue = u64_to_ptr(attr->value); int ufd = attr->map_fd; - struct fd f = fdget(ufd); struct bpf_map *map; void *key, *value; + struct fd f; int err; if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) return -EINVAL; + f = fdget(ufd); map = bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -265,14 +267,15 @@ static int map_delete_elem(union bpf_attr *attr) { void __user *ukey = u64_to_ptr(attr->key); int ufd = attr->map_fd; - struct fd f = fdget(ufd); struct bpf_map *map; + struct fd f; void *key; int err; if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) return -EINVAL; + f = fdget(ufd); map = bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -305,14 +308,15 @@ static int map_get_next_key(union bpf_attr *attr) void __user *ukey = u64_to_ptr(attr->key); void __user *unext_key = u64_to_ptr(attr->next_key); int ufd = attr->map_fd; - struct fd f = fdget(ufd); struct bpf_map *map; void *key, *next_key; + struct fd f; int err; if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) return -EINVAL; + f = fdget(ufd); map = bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); From 6b9ea5a64ed5eeb3f68f2e6fcce0ed1179801d1e Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 8 Sep 2015 10:53:04 -0700 Subject: [PATCH 42/65] 
ipv6: fix multipath route replace error recovery Problem: The ecmp route replace support for ipv6 in the kernel deletes the existing ecmp route too early, i.e. when it installs the first nexthop. If there is an error in installing the subsequent nexthops, it's too late to recover the already deleted existing route leaving the fib in an inconsistent state. This patch reduces the possibility of this by doing the following: a) Changes the existing multipath route add code to a two stage process: build rt6_infos + insert them ip6_route_add rt6_info creation code is moved into ip6_route_info_create. b) This ensures that most errors are caught during building rt6_infos and we fail early c) Separates multipath add and del code. Because add needs the special two stage mode in a) and delete essentially does not care. d) In any event if the code fails during inserting a route again, a warning is printed (This should be unlikely) Before the patch: $ip -6 route show 3000:1000:1000:1000::2 via fe80::202:ff:fe00:b dev swp49s0 metric 1024 3000:1000:1000:1000::2 via fe80::202:ff:fe00:d dev swp49s1 metric 1024 3000:1000:1000:1000::2 via fe80::202:ff:fe00:f dev swp49s2 metric 1024 /* Try replacing the route with a duplicate nexthop */ $ip -6 route change 3000:1000:1000:1000::2/128 nexthop via fe80::202:ff:fe00:b dev swp49s0 nexthop via fe80::202:ff:fe00:d dev swp49s1 nexthop via fe80::202:ff:fe00:d dev swp49s1 RTNETLINK answers: File exists $ip -6 route show /* previously added ecmp route 3000:1000:1000:1000::2 disappears from * kernel */ After the patch: $ip -6 route show 3000:1000:1000:1000::2 via fe80::202:ff:fe00:b dev swp49s0 metric 1024 3000:1000:1000:1000::2 via fe80::202:ff:fe00:d dev swp49s1 metric 1024 3000:1000:1000:1000::2 via fe80::202:ff:fe00:f dev swp49s2 metric 1024 /* Try replacing the route with a duplicate nexthop */ $ip -6 route change 3000:1000:1000:1000::2/128 nexthop via fe80::202:ff:fe00:b dev swp49s0 nexthop via fe80::202:ff:fe00:d dev swp49s1 nexthop via
fe80::202:ff:fe00:d dev swp49s1 RTNETLINK answers: File exists $ip -6 route show 3000:1000:1000:1000::2 via fe80::202:ff:fe00:b dev swp49s0 metric 1024 3000:1000:1000:1000::2 via fe80::202:ff:fe00:d dev swp49s1 metric 1024 3000:1000:1000:1000::2 via fe80::202:ff:fe00:f dev swp49s2 metric 1024 Fixes: 27596472473a ("ipv6: fix ECMP route replacement") Signed-off-by: Roopa Prabhu Reviewed-by: Nikolay Aleksandrov Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/route.c | 207 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 178 insertions(+), 29 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f45cac6f8356..34539d3b843f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1748,7 +1748,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc, return -EINVAL; } -int ip6_route_add(struct fib6_config *cfg) +int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret) { int err; struct net *net = cfg->fc_nlinfo.nl_net; @@ -1756,7 +1756,6 @@ int ip6_route_add(struct fib6_config *cfg) struct net_device *dev = NULL; struct inet6_dev *idev = NULL; struct fib6_table *table; - struct mx6_config mxc = { .mx = NULL, }; int addr_type; if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128) @@ -1981,14 +1980,9 @@ install_route: cfg->fc_nlinfo.nl_net = dev_net(dev); - err = ip6_convert_metrics(&mxc, cfg); - if (err) - goto out; + *rt_ret = rt; - err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc); - - kfree(mxc.mx); - return err; + return 0; out: if (dev) dev_put(dev); @@ -1996,6 +1990,35 @@ out: in6_dev_put(idev); if (rt) dst_free(&rt->dst); + + *rt_ret = NULL; + + return err; +} + +int ip6_route_add(struct fib6_config *cfg) +{ + struct mx6_config mxc = { .mx = NULL, }; + struct rt6_info *rt = NULL; + int err; + + err = ip6_route_info_create(cfg, &rt); + if (err) + goto out; + + err = ip6_convert_metrics(&mxc, cfg); + if (err) + goto out; + + err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc); + + kfree(mxc.mx); + + return 
err; +out: + if (rt) + dst_free(&rt->dst); + return err; } @@ -2776,19 +2799,78 @@ errout: return err; } -static int ip6_route_multipath(struct fib6_config *cfg, int add) +struct rt6_nh { + struct rt6_info *rt6_info; + struct fib6_config r_cfg; + struct mx6_config mxc; + struct list_head next; +}; + +static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) +{ + struct rt6_nh *nh; + + list_for_each_entry(nh, rt6_nh_list, next) { + pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n", + &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, + nh->r_cfg.fc_ifindex); + } +} + +static int ip6_route_info_append(struct list_head *rt6_nh_list, + struct rt6_info *rt, struct fib6_config *r_cfg) +{ + struct rt6_nh *nh; + struct rt6_info *rtnh; + int err = -EEXIST; + + list_for_each_entry(nh, rt6_nh_list, next) { + /* check if rt6_info already exists */ + rtnh = nh->rt6_info; + + if (rtnh->dst.dev == rt->dst.dev && + rtnh->rt6i_idev == rt->rt6i_idev && + ipv6_addr_equal(&rtnh->rt6i_gateway, + &rt->rt6i_gateway)) + return err; + } + + nh = kzalloc(sizeof(*nh), GFP_KERNEL); + if (!nh) + return -ENOMEM; + nh->rt6_info = rt; + err = ip6_convert_metrics(&nh->mxc, r_cfg); + if (err) { + kfree(nh); + return err; + } + memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); + list_add_tail(&nh->next, rt6_nh_list); + + return 0; +} + +static int ip6_route_multipath_add(struct fib6_config *cfg) { struct fib6_config r_cfg; struct rtnexthop *rtnh; + struct rt6_info *rt; + struct rt6_nh *err_nh; + struct rt6_nh *nh, *nh_safe; int remaining; int attrlen; - int err = 0, last_err = 0; + int err = 1; + int nhn = 0; + int replace = (cfg->fc_nlinfo.nlh && + (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); + LIST_HEAD(rt6_nh_list); remaining = cfg->fc_mp_len; -beginning: rtnh = (struct rtnexthop *)cfg->fc_mp; - /* Parse a Multipath Entry */ + /* Parse a Multipath Entry and build a list (rt6_nh_list) of + * rt6_info structs per nexthop + */ 
while (rtnh_ok(rtnh, remaining)) { memcpy(&r_cfg, cfg, sizeof(*cfg)); if (rtnh->rtnh_ifindex) @@ -2808,22 +2890,32 @@ beginning: if (nla) r_cfg.fc_encap_type = nla_get_u16(nla); } - err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg); + + err = ip6_route_info_create(&r_cfg, &rt); + if (err) + goto cleanup; + + err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { - last_err = err; - /* If we are trying to remove a route, do not stop the - * loop when ip6_route_del() fails (because next hop is - * already gone), we should try to remove all next hops. - */ - if (add) { - /* If add fails, we should try to delete all - * next hops that have been already added. - */ - add = 0; - remaining = cfg->fc_mp_len - remaining; - goto beginning; - } + dst_free(&rt->dst); + goto cleanup; } + + rtnh = rtnh_next(rtnh, &remaining); + } + + err_nh = NULL; + list_for_each_entry(nh, &rt6_nh_list, next) { + err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc); + /* nh->rt6_info is used or freed at this point, reset to NULL*/ + nh->rt6_info = NULL; + if (err) { + if (replace && nhn) + ip6_print_replace_route_err(&rt6_nh_list); + err_nh = nh; + goto add_errout; + } + /* Because each route is added like a single route we remove * these flags after the first nexthop: if there is a collision, * we have already failed to add the first nexthop: @@ -2833,6 +2925,63 @@ beginning: */ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_REPLACE); + nhn++; + } + + goto cleanup; + +add_errout: + /* Delete routes that were already added */ + list_for_each_entry(nh, &rt6_nh_list, next) { + if (err_nh == nh) + break; + ip6_route_del(&nh->r_cfg); + } + +cleanup: + list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { + if (nh->rt6_info) + dst_free(&nh->rt6_info->dst); + if (nh->mxc.mx) + kfree(nh->mxc.mx); + list_del(&nh->next); + kfree(nh); + } + + return err; +} + +static int ip6_route_multipath_del(struct fib6_config *cfg) +{ + struct fib6_config r_cfg; + struct 
rtnexthop *rtnh; + int remaining; + int attrlen; + int err = 1, last_err = 0; + + remaining = cfg->fc_mp_len; + rtnh = (struct rtnexthop *)cfg->fc_mp; + + /* Parse a Multipath Entry */ + while (rtnh_ok(rtnh, remaining)) { + memcpy(&r_cfg, cfg, sizeof(*cfg)); + if (rtnh->rtnh_ifindex) + r_cfg.fc_ifindex = rtnh->rtnh_ifindex; + + attrlen = rtnh_attrlen(rtnh); + if (attrlen > 0) { + struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + + nla = nla_find(attrs, attrlen, RTA_GATEWAY); + if (nla) { + nla_memcpy(&r_cfg.fc_gateway, nla, 16); + r_cfg.fc_flags |= RTF_GATEWAY; + } + } + err = ip6_route_del(&r_cfg); + if (err) + last_err = err; + rtnh = rtnh_next(rtnh, &remaining); } @@ -2849,7 +2998,7 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) return err; if (cfg.fc_mp) - return ip6_route_multipath(&cfg, 0); + return ip6_route_multipath_del(&cfg); else return ip6_route_del(&cfg); } @@ -2864,7 +3013,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) return err; if (cfg.fc_mp) - return ip6_route_multipath(&cfg, 1); + return ip6_route_multipath_add(&cfg); else return ip6_route_add(&cfg); } From 687f07156b0c99205c21aa4e2986564046d342fe Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 8 Sep 2015 13:40:01 -0700 Subject: [PATCH 43/65] bpf: fix out of bounds access in verifier log when the verifier log is enabled the print_bpf_insn() is doing bpf_alu_string[BPF_OP(insn->code) >> 4] and bpf_jmp_string[BPF_OP(insn->code) >> 4] where BPF_OP is a 4-bit instruction opcode. Malformed insns can cause out of bounds access. Fix it by sizing arrays appropriately. The bug was found by clang address sanitizer with libfuzzer. Reported-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ed12e385fb75..b074b23000d6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -283,7 +283,7 @@ static const char *const bpf_class_string[] = { [BPF_ALU64] = "alu64", }; -static const char *const bpf_alu_string[] = { +static const char *const bpf_alu_string[16] = { [BPF_ADD >> 4] = "+=", [BPF_SUB >> 4] = "-=", [BPF_MUL >> 4] = "*=", @@ -307,7 +307,7 @@ static const char *const bpf_ldst_string[] = { [BPF_DW >> 3] = "u64", }; -static const char *const bpf_jmp_string[] = { +static const char *const bpf_jmp_string[16] = { [BPF_JA >> 4] = "jmp", [BPF_JEQ >> 4] = "==", [BPF_JGT >> 4] = ">", From 03679a14739a0d4c14b52ba65a69ff553bfba73b Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 8 Sep 2015 20:06:41 -0700 Subject: [PATCH 44/65] net: dsa: bcm_sf2: Fix 64-bits register writes The macro to write 64-bits quantities to the 32-bits register swapped the value and offsets arguments, we want to preserve the ordering of the arguments with respect to how writel() is implemented for instance: value first, offset/base second. Fixes: 246d7f773c13 ("net: dsa: add Broadcom SF2 switch driver") Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. 
Miller --- drivers/net/dsa/bcm_sf2.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/bcm_sf2.h b/drivers/net/dsa/bcm_sf2.h index 22e2ebf31333..789d7b7737da 100644 --- a/drivers/net/dsa/bcm_sf2.h +++ b/drivers/net/dsa/bcm_sf2.h @@ -112,8 +112,8 @@ static inline u64 name##_readq(struct bcm_sf2_priv *priv, u32 off) \ spin_unlock(&priv->indir_lock); \ return (u64)indir << 32 | dir; \ } \ -static inline void name##_writeq(struct bcm_sf2_priv *priv, u32 off, \ - u64 val) \ +static inline void name##_writeq(struct bcm_sf2_priv *priv, u64 val, \ + u32 off) \ { \ spin_lock(&priv->indir_lock); \ reg_writel(priv, upper_32_bits(val), REG_DIR_DATA_WRITE); \ From 444c5f92ed152346aef0952316e0ea855129846c Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 9 Sep 2015 11:24:29 +0200 Subject: [PATCH 45/65] net: ethoc: Remove unnecessary #ifdef CONFIG_OF For !CONFIG_OF of_get_property() is defined to always return NULL. Thus there's no need to protect the call to of_get_property() with #ifdef CONFIG_OF. Signed-off-by: Tobias Klauser Signed-off-by: David S. Miller --- drivers/net/ethernet/ethoc.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c index 442410cd2ca4..a2c96fd88393 100644 --- a/drivers/net/ethernet/ethoc.c +++ b/drivers/net/ethernet/ethoc.c @@ -1132,10 +1132,6 @@ static int ethoc_probe(struct platform_device *pdev) memcpy(netdev->dev_addr, pdata->hwaddr, IFHWADDRLEN); priv->phy_id = pdata->phy_id; } else { - priv->phy_id = -1; - -#ifdef CONFIG_OF - { const uint8_t *mac; mac = of_get_property(pdev->dev.of_node, @@ -1143,8 +1139,7 @@ static int ethoc_probe(struct platform_device *pdev) NULL); if (mac) memcpy(netdev->dev_addr, mac, IFHWADDRLEN); - } -#endif + priv->phy_id = -1; } /* Check that the given MAC address is valid. 
If it isn't, read the From f53de1e9a4aaf8cbe08845da6f7ff26a078ac507 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 9 Sep 2015 14:20:56 +0200 Subject: [PATCH 46/65] net: ipv6: use common fib_default_rule_pref This switches IPv6 policy routing to use the shared fib_default_rule_pref() function of IPv4 and DECnet. It is also used in multicast routing for IPv4 as well as IPv6. The motivation for this patch is a complaint about iproute2 behaving inconsistently between IPv4 and IPv6 when adding policy rules: Formerly, IPv6 rules were assigned a fixed priority of 0x3FFF whereas for IPv4 the assigned priority value was decreased with each rule added. With that, all users of the default_pref field assign the generic function fib_default_rule_pref(), so fib_nl_newrule() may just call it directly instead. Therefore get rid of the function pointer altogether and make fib_default_rule_pref() static, as it's not used outside fib_rules.c anymore. Signed-off-by: Phil Sutter Signed-off-by: David S.
Miller --- include/net/fib_rules.h | 2 -- net/core/fib_rules.c | 10 +++------- net/decnet/dn_rules.c | 1 - net/ipv4/fib_rules.c | 1 - net/ipv4/ipmr.c | 1 - net/ipv6/fib6_rules.c | 6 ------ net/ipv6/ip6mr.c | 1 - 7 files changed, 3 insertions(+), 19 deletions(-) diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 4e8f804f4589..59160de702b6 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -66,7 +66,6 @@ struct fib_rules_ops { struct nlattr **); int (*fill)(struct fib_rule *, struct sk_buff *, struct fib_rule_hdr *); - u32 (*default_pref)(struct fib_rules_ops *ops); size_t (*nlmsg_payload)(struct fib_rule *); /* Called after modifications to the rules set, must flush @@ -118,5 +117,4 @@ int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags, struct fib_lookup_arg *); int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table, u32 flags); -u32 fib_default_rule_pref(struct fib_rules_ops *ops); #endif diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index ae8306e7c56f..bf77e3639ce0 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -44,7 +44,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, } EXPORT_SYMBOL(fib_default_rule_add); -u32 fib_default_rule_pref(struct fib_rules_ops *ops) +static u32 fib_default_rule_pref(struct fib_rules_ops *ops) { struct list_head *pos; struct fib_rule *rule; @@ -60,7 +60,6 @@ u32 fib_default_rule_pref(struct fib_rules_ops *ops) return 0; } -EXPORT_SYMBOL(fib_default_rule_pref); static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, @@ -299,8 +298,8 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) } rule->fr_net = net; - if (tb[FRA_PRIORITY]) - rule->pref = nla_get_u32(tb[FRA_PRIORITY]); + rule->pref = tb[FRA_PRIORITY] ? 
nla_get_u32(tb[FRA_PRIORITY]) + : fib_default_rule_pref(ops); if (tb[FRA_IIFNAME]) { struct net_device *dev; @@ -350,9 +349,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) else rule->suppress_ifgroup = -1; - if (!tb[FRA_PRIORITY] && ops->default_pref) - rule->pref = ops->default_pref(ops); - err = -EINVAL; if (tb[FRA_GOTO]) { if (rule->action != FR_ACT_GOTO) diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c index 9d66a0f72f90..295bbd6a56f2 100644 --- a/net/decnet/dn_rules.c +++ b/net/decnet/dn_rules.c @@ -229,7 +229,6 @@ static const struct fib_rules_ops __net_initconst dn_fib_rules_ops_template = { .configure = dn_fib_rule_configure, .compare = dn_fib_rule_compare, .fill = dn_fib_rule_fill, - .default_pref = fib_default_rule_pref, .flush_cache = dn_fib_rule_flush_cache, .nlgroup = RTNLGRP_DECnet_RULE, .policy = dn_fib_rule_policy, diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 18123d50f576..f2bda9e89c61 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -318,7 +318,6 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { .delete = fib4_rule_delete, .compare = fib4_rule_compare, .fill = fib4_rule_fill, - .default_pref = fib_default_rule_pref, .nlmsg_payload = fib4_rule_nlmsg_payload, .flush_cache = fib4_rule_flush_cache, .nlgroup = RTNLGRP_IPV4_RULE, diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 3a2c0162c3ba..866ee89f5254 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -233,7 +233,6 @@ static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = { .match = ipmr_rule_match, .configure = ipmr_rule_configure, .compare = ipmr_rule_compare, - .default_pref = fib_default_rule_pref, .fill = ipmr_rule_fill, .nlgroup = RTNLGRP_IPV4_RULE, .policy = ipmr_rule_policy, diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 2367a16eae58..9f777ec59a59 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -258,11 +258,6 @@ nla_put_failure: 
return -ENOBUFS; } -static u32 fib6_rule_default_pref(struct fib_rules_ops *ops) -{ - return 0x3FFF; -} - static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(16) /* dst */ @@ -279,7 +274,6 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .configure = fib6_rule_configure, .compare = fib6_rule_compare, .fill = fib6_rule_fill, - .default_pref = fib6_rule_default_pref, .nlmsg_payload = fib6_rule_nlmsg_payload, .nlgroup = RTNLGRP_IPV6_RULE, .policy = fib6_rule_policy, diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 5f36266b1f5e..0e004cc42a22 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -217,7 +217,6 @@ static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = { .match = ip6mr_rule_match, .configure = ip6mr_rule_configure, .compare = ip6mr_rule_compare, - .default_pref = fib_default_rule_pref, .fill = ip6mr_rule_fill, .nlgroup = RTNLGRP_IPV6_RULE, .policy = ip6mr_rule_policy, From ce8e5c7035098fa5b8fea910f14be59b8cace81f Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 9 Sep 2015 10:38:02 +0200 Subject: [PATCH 47/65] net: cavium: liquidio: use kzalloc in setup_glist() We save a little .text and get rid of the sizeof(...) style inconsistency. Signed-off-by: Rasmus Villemoes Signed-off-by: David S. 
Miller --- drivers/net/ethernet/cavium/liquidio/lio_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c index 0660deecc2c9..f683d97d7614 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c @@ -818,10 +818,9 @@ static int setup_glist(struct lio *lio) INIT_LIST_HEAD(&lio->glist); for (i = 0; i < lio->tx_qsize; i++) { - g = kmalloc(sizeof(*g), GFP_KERNEL); + g = kzalloc(sizeof(*g), GFP_KERNEL); if (!g) break; - memset(g, 0, sizeof(struct octnic_gather)); g->sg_size = ((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * OCT_SG_ENTRY_SIZE); From e9b5ac277e8f8dffa28f85a065e2fd890d9e48c7 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 9 Sep 2015 10:38:03 +0200 Subject: [PATCH 48/65] net: jme: use kzalloc() instead of kmalloc+memset Using kzalloc saves a tiny bit on .text. Signed-off-by: Rasmus Villemoes Signed-off-by: David S. 
Miller --- drivers/net/ethernet/jme.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c index 6e9a792097d3..060dd3922974 100644 --- a/drivers/net/ethernet/jme.c +++ b/drivers/net/ethernet/jme.c @@ -583,7 +583,7 @@ jme_setup_tx_resources(struct jme_adapter *jme) atomic_set(&txring->next_to_clean, 0); atomic_set(&txring->nr_free, jme->tx_ring_size); - txring->bufinf = kmalloc(sizeof(struct jme_buffer_info) * + txring->bufinf = kzalloc(sizeof(struct jme_buffer_info) * jme->tx_ring_size, GFP_ATOMIC); if (unlikely(!(txring->bufinf))) goto err_free_txring; @@ -592,8 +592,6 @@ jme_setup_tx_resources(struct jme_adapter *jme) * Initialize Transmit Descriptors */ memset(txring->alloc, 0, TX_RING_ALLOC_SIZE(jme->tx_ring_size)); - memset(txring->bufinf, 0, - sizeof(struct jme_buffer_info) * jme->tx_ring_size); return 0; @@ -845,7 +843,7 @@ jme_setup_rx_resources(struct jme_adapter *jme) rxring->next_to_use = 0; atomic_set(&rxring->next_to_clean, 0); - rxring->bufinf = kmalloc(sizeof(struct jme_buffer_info) * + rxring->bufinf = kzalloc(sizeof(struct jme_buffer_info) * jme->rx_ring_size, GFP_ATOMIC); if (unlikely(!(rxring->bufinf))) goto err_free_rxring; @@ -853,8 +851,6 @@ jme_setup_rx_resources(struct jme_adapter *jme) /* * Initiallize Receive Descriptors */ - memset(rxring->bufinf, 0, - sizeof(struct jme_buffer_info) * jme->rx_ring_size); for (i = 0 ; i < jme->rx_ring_size ; ++i) { if (unlikely(jme_make_new_rx_buf(jme, i))) { jme_free_rx_resources(jme); From b66a60857ca4fae5900c5d81c2ba04e657509b99 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 9 Sep 2015 10:38:04 +0200 Subject: [PATCH 49/65] net: mv643xx_eth: use kzalloc The double memset is a little ugly; using kzalloc avoids it altogether. Signed-off-by: Rasmus Villemoes Signed-off-by: David S. 
Miller --- drivers/net/ethernet/marvell/mv643xx_eth.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index d52639bc491f..960169efe636 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -1859,14 +1859,11 @@ oom: return; } - mc_spec = kmalloc(0x200, GFP_ATOMIC); + mc_spec = kzalloc(0x200, GFP_ATOMIC); if (mc_spec == NULL) goto oom; mc_other = mc_spec + (0x100 >> 2); - memset(mc_spec, 0, 0x100); - memset(mc_other, 0, 0x100); - netdev_for_each_mc_addr(ha, dev) { u8 *a = ha->addr; u32 *table; From 1f0ca208531a152e1da6aa43d095fe0b2039d9ca Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 9 Sep 2015 10:38:05 +0200 Subject: [PATCH 50/65] net: qlcnic: delete redundant memsets In all cases, mbx->req.arg and mbx->rsp.arg have just been allocated using kcalloc(), so these six memsets are redundant. Signed-off-by: Rasmus Villemoes Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c | 2 -- drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c | 2 -- drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c | 2 -- 3 files changed, 6 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c index 5ab3adf88166..9f0bdd993955 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c @@ -918,8 +918,6 @@ int qlcnic_83xx_alloc_mbx_args(struct qlcnic_cmd_args *mbx, mbx->req.arg = NULL; return -ENOMEM; } - memset(mbx->req.arg, 0, sizeof(u32) * mbx->req.num); - memset(mbx->rsp.arg, 0, sizeof(u32) * mbx->rsp.num); temp = adapter->ahw->fw_hal_version << 29; mbx->req.arg[0] = (type | (mbx->req.num << 16) | temp); mbx->cmd_op = type; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c index 6e6f18fc5d76..a5f422f26cb4 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c @@ -73,8 +73,6 @@ int qlcnic_82xx_alloc_mbx_args(struct qlcnic_cmd_args *mbx, mbx->req.arg = NULL; return -ENOMEM; } - memset(mbx->req.arg, 0, sizeof(u32) * mbx->req.num); - memset(mbx->rsp.arg, 0, sizeof(u32) * mbx->rsp.num); mbx->req.arg[0] = type; break; } diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c index 546cd5f1c85a..7327b729ba2e 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c @@ -729,8 +729,6 @@ static int qlcnic_sriov_alloc_bc_mbx_args(struct qlcnic_cmd_args *mbx, u32 type) mbx->req.arg = NULL; return -ENOMEM; } - memset(mbx->req.arg, 0, sizeof(u32) * mbx->req.num); - memset(mbx->rsp.arg, 0, sizeof(u32) * mbx->rsp.num); mbx->req.arg[0] = (type | (mbx->req.num << 16) | (3 << 29)); mbx->rsp.arg[0] = (type & 
0xffff) | mbx->rsp.num << 16; From dfc50fcaad574e5c8c85cbc83eca1426b2413fa4 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Wed, 9 Sep 2015 18:01:08 +0300 Subject: [PATCH 51/65] stmmac: fix check for phydev being open The current check of phydev with IS_ERR(phydev) may not make much sense because of_phy_connect() returns NULL on failure instead of an error value. Still, for checking the result of phy_connect(), IS_ERR() makes perfect sense. So let's use the combined check IS_ERR_OR_NULL() that covers both cases. Cc: Sergei Shtylyov Cc: Giuseppe Cavallaro Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Cc: David Miller Signed-off-by: Alexey Brodkin Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 864b476f7fd5..925f2f8659b8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -837,8 +837,11 @@ static int stmmac_init_phy(struct net_device *dev) interface); } - if (IS_ERR(phydev)) { + if (IS_ERR_OR_NULL(phydev)) { pr_err("%s: Could not attach to PHY\n", dev->name); + if (!phydev) + return -ENODEV; + return PTR_ERR(phydev); } From 792aec47d59d951865cc617a97b6e6be53d4b977 Mon Sep 17 00:00:00 2001 From: "Woojung.Huh@microchip.com" Date: Wed, 9 Sep 2015 20:49:53 +0000 Subject: [PATCH 52/65] add microchip LAN88xx phy driver Add Microchip LAN88XX phy driver for phylib. Signed-off-by: Woojung Huh Signed-off-by: David S.
Miller --- drivers/net/phy/Kconfig | 5 ++ drivers/net/phy/Makefile | 1 + drivers/net/phy/microchip.c | 148 +++++++++++++++++++++++++++++++++++ include/linux/microchipphy.h | 73 +++++++++++++++++ 4 files changed, 227 insertions(+) create mode 100644 drivers/net/phy/microchip.c create mode 100644 include/linux/microchipphy.h diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index c07030dbe748..c5ad98ace5d0 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -127,6 +127,11 @@ config DP83867_PHY ---help--- Currently supports the DP83867 PHY. +config MICROCHIP_PHY + tristate "Drivers for Microchip PHYs" + help + Supports the LAN88XX PHYs. + config FIXED_PHY tristate "Driver for MDIO Bus/PHY emulation with fixed speed/link PHYs" depends on PHYLIB diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index 9bb103358c74..87f079c4b2c7 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -37,3 +37,4 @@ obj-$(CONFIG_MDIO_BUS_MUX_MMIOREG) += mdio-mux-mmioreg.o obj-$(CONFIG_MDIO_SUN4I) += mdio-sun4i.o obj-$(CONFIG_MDIO_MOXART) += mdio-moxart.o obj-$(CONFIG_MDIO_BCM_UNIMAC) += mdio-bcm-unimac.o +obj-$(CONFIG_MICROCHIP_PHY) += microchip.o diff --git a/drivers/net/phy/microchip.c b/drivers/net/phy/microchip.c new file mode 100644 index 000000000000..c0a20ebd083b --- /dev/null +++ b/drivers/net/phy/microchip.c @@ -0,0 +1,148 @@ +/* + * Copyright (C) 2015 Microchip Technology + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include +#include +#include +#include +#include +#include + +#define DRIVER_AUTHOR "WOOJUNG HUH " +#define DRIVER_DESC "Microchip LAN88XX PHY driver" + +struct lan88xx_priv { + int chip_id; + int chip_rev; + __u32 wolopts; +}; + +static int lan88xx_phy_config_intr(struct phy_device *phydev) +{ + int rc; + + if (phydev->interrupts == PHY_INTERRUPT_ENABLED) { + /* unmask all source and clear them before enable */ + rc = phy_write(phydev, LAN88XX_INT_MASK, 0x7FFF); + rc = phy_read(phydev, LAN88XX_INT_STS); + rc = phy_write(phydev, LAN88XX_INT_MASK, + LAN88XX_INT_MASK_MDINTPIN_EN_ | + LAN88XX_INT_MASK_LINK_CHANGE_); + } else { + rc = phy_write(phydev, LAN88XX_INT_MASK, 0); + } + + return rc < 0 ? rc : 0; +} + +static int lan88xx_phy_ack_interrupt(struct phy_device *phydev) +{ + int rc = phy_read(phydev, LAN88XX_INT_STS); + + return rc < 0 ? rc : 0; +} + +int lan88xx_suspend(struct phy_device *phydev) +{ + struct lan88xx_priv *priv = phydev->priv; + + /* do not power down PHY when WOL is enabled */ + if (!priv->wolopts) + genphy_suspend(phydev); + + return 0; +} + +static int lan88xx_probe(struct phy_device *phydev) +{ + struct device *dev = &phydev->dev; + struct lan88xx_priv *priv; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->wolopts = 0; + + /* these values can be used to identify internal PHY */ + priv->chip_id = phy_read_mmd_indirect(phydev, LAN88XX_MMD3_CHIP_ID, + 3, phydev->addr); + priv->chip_rev = phy_read_mmd_indirect(phydev, LAN88XX_MMD3_CHIP_REV, + 3, phydev->addr); + + phydev->priv = priv; + + return 0; +} + +static void lan88xx_remove(struct phy_device *phydev) +{ + struct device *dev = &phydev->dev; + struct lan88xx_priv *priv = phydev->priv; + + if (priv) + devm_kfree(dev, priv); +} + +static int lan88xx_set_wol(struct phy_device *phydev, + struct ethtool_wolinfo *wol) +{ + struct 
lan88xx_priv *priv = phydev->priv; + + priv->wolopts = wol->wolopts; + + return 0; +} + +static struct phy_driver microchip_phy_driver[] = { +{ + .phy_id = 0x0007c130, + .phy_id_mask = 0xfffffff0, + .name = "Microchip LAN88xx", + + .features = (PHY_GBIT_FEATURES | + SUPPORTED_Pause | SUPPORTED_Asym_Pause), + .flags = PHY_HAS_INTERRUPT | PHY_HAS_MAGICANEG, + + .probe = lan88xx_probe, + .remove = lan88xx_remove, + + .config_init = genphy_config_init, + .config_aneg = genphy_config_aneg, + .read_status = genphy_read_status, + + .ack_interrupt = lan88xx_phy_ack_interrupt, + .config_intr = lan88xx_phy_config_intr, + + .suspend = lan88xx_suspend, + .resume = genphy_resume, + .set_wol = lan88xx_set_wol, + + .driver = { .owner = THIS_MODULE, } +} }; + +module_phy_driver(microchip_phy_driver); + +static struct mdio_device_id __maybe_unused microchip_tbl[] = { + { 0x0007c130, 0xfffffff0 }, + { } +}; + +MODULE_DEVICE_TABLE(mdio, microchip_tbl); + +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_LICENSE("GPL"); diff --git a/include/linux/microchipphy.h b/include/linux/microchipphy.h new file mode 100644 index 000000000000..eb492d47f717 --- /dev/null +++ b/include/linux/microchipphy.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2015 Microchip Technology + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#ifndef _MICROCHIPPHY_H +#define _MICROCHIPPHY_H + +#define LAN88XX_INT_MASK (0x19) +#define LAN88XX_INT_MASK_MDINTPIN_EN_ (0x8000) +#define LAN88XX_INT_MASK_SPEED_CHANGE_ (0x4000) +#define LAN88XX_INT_MASK_LINK_CHANGE_ (0x2000) +#define LAN88XX_INT_MASK_FDX_CHANGE_ (0x1000) +#define LAN88XX_INT_MASK_AUTONEG_ERR_ (0x0800) +#define LAN88XX_INT_MASK_AUTONEG_DONE_ (0x0400) +#define LAN88XX_INT_MASK_POE_DETECT_ (0x0200) +#define LAN88XX_INT_MASK_SYMBOL_ERR_ (0x0100) +#define LAN88XX_INT_MASK_FAST_LINK_FAIL_ (0x0080) +#define LAN88XX_INT_MASK_WOL_EVENT_ (0x0040) +#define LAN88XX_INT_MASK_EXTENDED_INT_ (0x0020) +#define LAN88XX_INT_MASK_RESERVED_ (0x0010) +#define LAN88XX_INT_MASK_FALSE_CARRIER_ (0x0008) +#define LAN88XX_INT_MASK_LINK_SPEED_DS_ (0x0004) +#define LAN88XX_INT_MASK_MASTER_SLAVE_DONE_ (0x0002) +#define LAN88XX_INT_MASK_RX__ER_ (0x0001) + +#define LAN88XX_INT_STS (0x1A) +#define LAN88XX_INT_STS_INT_ACTIVE_ (0x8000) +#define LAN88XX_INT_STS_SPEED_CHANGE_ (0x4000) +#define LAN88XX_INT_STS_LINK_CHANGE_ (0x2000) +#define LAN88XX_INT_STS_FDX_CHANGE_ (0x1000) +#define LAN88XX_INT_STS_AUTONEG_ERR_ (0x0800) +#define LAN88XX_INT_STS_AUTONEG_DONE_ (0x0400) +#define LAN88XX_INT_STS_POE_DETECT_ (0x0200) +#define LAN88XX_INT_STS_SYMBOL_ERR_ (0x0100) +#define LAN88XX_INT_STS_FAST_LINK_FAIL_ (0x0080) +#define LAN88XX_INT_STS_WOL_EVENT_ (0x0040) +#define LAN88XX_INT_STS_EXTENDED_INT_ (0x0020) +#define LAN88XX_INT_STS_RESERVED_ (0x0010) +#define LAN88XX_INT_STS_FALSE_CARRIER_ (0x0008) +#define LAN88XX_INT_STS_LINK_SPEED_DS_ (0x0004) +#define LAN88XX_INT_STS_MASTER_SLAVE_DONE_ (0x0002) +#define LAN88XX_INT_STS_RX_ER_ (0x0001) + +#define LAN88XX_EXT_PAGE_ACCESS (0x1F) +#define LAN88XX_EXT_PAGE_SPACE_0 (0x0000) +#define LAN88XX_EXT_PAGE_SPACE_1 (0x0001) +#define LAN88XX_EXT_PAGE_SPACE_2 (0x0002) + +/* Extended Register Page 1 space */ +#define LAN88XX_EXT_MODE_CTRL (0x13) +#define LAN88XX_EXT_MODE_CTRL_MDIX_MASK_ (0x000C) +#define LAN88XX_EXT_MODE_CTRL_AUTO_MDIX_ 
(0x0000) +#define LAN88XX_EXT_MODE_CTRL_MDI_ (0x0008) +#define LAN88XX_EXT_MODE_CTRL_MDI_X_ (0x000C) + +/* MMD 3 Registers */ +#define LAN88XX_MMD3_CHIP_ID (32877) +#define LAN88XX_MMD3_CHIP_REV (32878) + +#endif /* _MICROCHIPPHY_H */ From 52fe51f8523751da0e79c85350c47eb3bb94da5b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 10 Sep 2015 06:57:12 +0800 Subject: [PATCH 53/65] ipv6: fix ifnullfree.cocci warnings net/ipv6/route.c:2946:3-8: WARNING: NULL check before freeing functions like kfree, debugfs_remove, debugfs_remove_recursive or usb_free_urb is not needed. Maybe consider reorganizing relevant code to avoid passing NULL values. NULL check before some freeing functions is not needed. Based on checkpatch warning "kfree(NULL) is safe this check is probably not required" and kfreeaddr.cocci by Julia Lawall. Generated by: scripts/coccinelle/free/ifnullfree.cocci CC: Roopa Prabhu Signed-off-by: Fengguang Wu Signed-off-by: David S. Miller --- net/ipv6/route.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 34539d3b843f..53617d715188 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2942,8 +2942,7 @@ cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { if (nh->rt6_info) dst_free(&nh->rt6_info->dst); - if (nh->mxc.mx) - kfree(nh->mxc.mx); + kfree(nh->mxc.mx); list_del(&nh->next); kfree(nh); } From d0942473e3ca4629a40bbf0c9fd74fc0c7ff2a79 Mon Sep 17 00:00:00 2001 From: hayeswang Date: Mon, 7 Sep 2015 11:57:43 +0800 Subject: [PATCH 54/65] r8152: split DRIVER_VERSION Split DRIVER_VERSION into NETNEXT_VERSION and NET_VERSION. Then, according to the value of DRIVER_VERSION, we could know which patches are used generally without comparing the source code. Signed-off-by: Hayes Wang Signed-off-by: David S. 
Miller --- drivers/net/usb/r8152.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index fe4ec324aebc..6bb48bc51484 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -26,8 +26,13 @@ #include #include -/* Version Information */ -#define DRIVER_VERSION "v1.08.1 (2015/07/28)" +/* Information for net-next */ +#define NETNEXT_VERSION "08" + +/* Information for net */ +#define NET_VERSION "1" + +#define DRIVER_VERSION "v1." NETNEXT_VERSION "." NET_VERSION #define DRIVER_AUTHOR "Realtek linux nic maintainers " #define DRIVER_DESC "Realtek RTL8152/RTL8153 Based USB Ethernet Adapters" #define MODULENAME "r8152" From 2dd49e0f16fb0e07c6fcc1322ebba310f5827072 Mon Sep 17 00:00:00 2001 From: hayeswang Date: Mon, 7 Sep 2015 11:57:44 +0800 Subject: [PATCH 55/65] r8152: fix the runtime suspend issues Fix the runtime suspend issues result from the linking change. Case 1: a) link down occurs. b) driver disable tx/rx. c) autosuspend occurs. d) hw linking up. e) device suspends without enabling tx/rx. f) couldn't wake up when receiving packets. Case 2: a) Nway results in linking down. b) autosuspend occurs. c) device suspends. d) device may not wake up when linking up. Signed-off-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 59 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 6bb48bc51484..d9427ca3dba7 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -30,7 +30,7 @@ #define NETNEXT_VERSION "08" /* Information for net */ -#define NET_VERSION "1" +#define NET_VERSION "2" #define DRIVER_VERSION "v1." NETNEXT_VERSION "." 
NET_VERSION #define DRIVER_AUTHOR "Realtek linux nic maintainers " @@ -148,6 +148,7 @@ #define OCP_EEE_ABLE 0xa5c4 #define OCP_EEE_ADV 0xa5d0 #define OCP_EEE_LPABLE 0xa5d2 +#define OCP_PHY_STATE 0xa708 /* nway state for 8153 */ #define OCP_ADC_CFG 0xbc06 /* SRAM Register */ @@ -432,6 +433,10 @@ /* OCP_DOWN_SPEED */ #define EN_10M_BGOFF 0x0080 +/* OCP_PHY_STATE */ +#define TXDIS_STATE 0x01 +#define ABD_STATE 0x02 + /* OCP_ADC_CFG */ #define CKADSEL_L 0x0100 #define ADC_EN 0x0080 @@ -609,6 +614,7 @@ struct r8152 { void (*unload)(struct r8152 *); int (*eee_get)(struct r8152 *, struct ethtool_eee *); int (*eee_set)(struct r8152 *, struct ethtool_eee *); + bool (*in_nway)(struct r8152 *); } rtl_ops; int intr_interval; @@ -2946,6 +2952,32 @@ static void rtl8153_down(struct r8152 *tp) r8153_enable_aldps(tp); } +static bool rtl8152_in_nway(struct r8152 *tp) +{ + u16 nway_state; + + ocp_write_word(tp, MCU_TYPE_PLA, PLA_OCP_GPHY_BASE, 0x2000); + tp->ocp_base = 0x2000; + ocp_write_byte(tp, MCU_TYPE_PLA, 0xb014, 0x4c); /* phy state */ + nway_state = ocp_read_word(tp, MCU_TYPE_PLA, 0xb01a); + + /* bit 15: TXDIS_STATE, bit 14: ABD_STATE */ + if (nway_state & 0xc000) + return false; + else + return true; +} + +static bool rtl8153_in_nway(struct r8152 *tp) +{ + u16 phy_state = ocp_reg_read(tp, OCP_PHY_STATE) & 0xff; + + if (phy_state == TXDIS_STATE || phy_state == ABD_STATE) + return false; + else + return true; +} + static void set_carrier(struct r8152 *tp) { struct net_device *netdev = tp->netdev; @@ -3410,6 +3442,27 @@ static int rtl8152_post_reset(struct usb_interface *intf) return 0; } +static bool delay_autosuspend(struct r8152 *tp) +{ + bool sw_linking = !!netif_carrier_ok(tp->netdev); + bool hw_linking = !!(rtl8152_get_speed(tp) & LINK_STATUS); + + /* This means a linking change occurs and the driver doesn't detect it, + * yet. If the driver has disabled tx/rx and hw is linking on, the + * device wouldn't wake up by receiving any packet. 
+ */ + if (work_busy(&tp->schedule.work) || sw_linking != hw_linking) + return true; + + /* If the linking down is occurred by nway, the device may miss the + * linking change event. And it wouldn't wake when linking on. + */ + if (!sw_linking && tp->rtl_ops.in_nway(tp)) + return true; + else + return false; +} + static int rtl8152_suspend(struct usb_interface *intf, pm_message_t message) { struct r8152 *tp = usb_get_intfdata(intf); @@ -3419,7 +3472,7 @@ static int rtl8152_suspend(struct usb_interface *intf, pm_message_t message) mutex_lock(&tp->control); if (PMSG_IS_AUTO(message)) { - if (netif_running(netdev) && work_busy(&tp->schedule.work)) { + if (netif_running(netdev) && delay_autosuspend(tp)) { ret = -EBUSY; goto out1; } @@ -4049,6 +4102,7 @@ static int rtl_ops_init(struct r8152 *tp) ops->unload = rtl8152_unload; ops->eee_get = r8152_get_eee; ops->eee_set = r8152_set_eee; + ops->in_nway = rtl8152_in_nway; break; case RTL_VER_03: @@ -4063,6 +4117,7 @@ static int rtl_ops_init(struct r8152 *tp) ops->unload = rtl8153_unload; ops->eee_get = r8153_get_eee; ops->eee_set = r8153_set_eee; + ops->in_nway = rtl8153_in_nway; break; default: From 9638d19e481605217f95d9ab3c8896e499b1407d Mon Sep 17 00:00:00 2001 From: Nimrod Andy Date: Thu, 10 Sep 2015 09:35:39 +0800 Subject: [PATCH 56/65] net: fec: add netif status check before set mac address There is an issue in the following case that causes a system hang: ifconfig eth0 down ifconfig eth0 hw ether 00:10:19:19:81:19 After eth0 is brought down, all fec clocks are gated off. The .fec_set_mac_address() function then writes the new MAC address to the registers, which causes the system hang. So add a netif status check to avoid register access while the clocks are gated off. The new MAC address is written to the related registers once eth0 is brought up. V2: As per Lucas Stach's suggestion, add a comment in the code to explain why it is needed. CC: Lucas Stach CC: Florian Fainelli Signed-off-by: Fugang Duan Signed-off-by: David S.
Miller --- drivers/net/ethernet/freescale/fec_main.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 6cc334035e07..dd4ca39d5d8f 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -3031,6 +3031,14 @@ fec_set_mac_address(struct net_device *ndev, void *p) memcpy(ndev->dev_addr, addr->sa_data, ndev->addr_len); } + /* Add netif status check here to avoid system hang in below case: + * ifconfig ethx down; ifconfig ethx hw ether xx:xx:xx:xx:xx:xx; + * After ethx down, fec all clocks are gated off and then register + * access causes system hang. + */ + if (!netif_running(ndev)) + return 0; + writel(ndev->dev_addr[3] | (ndev->dev_addr[2] << 8) | (ndev->dev_addr[1] << 16) | (ndev->dev_addr[0] << 24), fep->hwp + FEC_ADDR_LOW); From f2be053c83ee93888fc09d90df2bded0deb28947 Mon Sep 17 00:00:00 2001 From: Hariprasad Shenai Date: Thu, 10 Sep 2015 09:55:13 +0530 Subject: [PATCH 57/65] cxgb4: changes for new firmware 1.14.4.0 Incorporate fw_ldst_cmd structure change for new firmware and also update version string for the same Signed-off-by: Hariprasad Shenai Signed-off-by: David S. 
Miller --- drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h | 33 +++++++++++++++++-- .../net/ethernet/chelsio/cxgb4/t4fw_version.h | 12 +++---- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h index ab4674684acc..a32de30ea663 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h @@ -762,8 +762,6 @@ enum fw_ldst_func_mod_index { struct fw_ldst_cmd { __be32 op_to_addrspace; -#define FW_LDST_CMD_ADDRSPACE_S 0 -#define FW_LDST_CMD_ADDRSPACE_V(x) ((x) << FW_LDST_CMD_ADDRSPACE_S) __be32 cycles_to_len16; union fw_ldst { struct fw_ldst_addrval { @@ -788,6 +786,13 @@ struct fw_ldst_cmd { __be16 vctl; __be16 rval; } mdio; + struct fw_ldst_cim_rq { + u8 req_first64[8]; + u8 req_second64[8]; + u8 resp_first64[8]; + u8 resp_second64[8]; + __be32 r3[2]; + } cim_rq; union fw_ldst_mps { struct fw_ldst_mps_rplc { __be16 fid_idx; @@ -828,9 +833,33 @@ struct fw_ldst_cmd { __be16 nset_pkd; __be32 data[12]; } pcie; + struct fw_ldst_i2c_deprecated { + u8 pid_pkd; + u8 base; + u8 boffset; + u8 data; + __be32 r9; + } i2c_deprecated; + struct fw_ldst_i2c { + u8 pid; + u8 did; + u8 boffset; + u8 blen; + __be32 r9; + __u8 data[48]; + } i2c; + struct fw_ldst_le { + __be32 index; + __be32 r9; + u8 val[33]; + u8 r11[7]; + } le; } u; }; +#define FW_LDST_CMD_ADDRSPACE_S 0 +#define FW_LDST_CMD_ADDRSPACE_V(x) ((x) << FW_LDST_CMD_ADDRSPACE_S) + #define FW_LDST_CMD_MSG_S 31 #define FW_LDST_CMD_MSG_V(x) ((x) << FW_LDST_CMD_MSG_S) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h index 92bafa793de6..c4b262ca7d43 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h @@ -36,8 +36,8 @@ #define __T4FW_VERSION_H__ #define T4FW_VERSION_MAJOR 0x01 -#define T4FW_VERSION_MINOR 0x0D -#define T4FW_VERSION_MICRO 0x20 +#define 
T4FW_VERSION_MINOR 0x0E +#define T4FW_VERSION_MICRO 0x04 #define T4FW_VERSION_BUILD 0x00 #define T4FW_MIN_VERSION_MAJOR 0x01 @@ -45,8 +45,8 @@ #define T4FW_MIN_VERSION_MICRO 0x00 #define T5FW_VERSION_MAJOR 0x01 -#define T5FW_VERSION_MINOR 0x0D -#define T5FW_VERSION_MICRO 0x20 +#define T5FW_VERSION_MINOR 0x0E +#define T5FW_VERSION_MICRO 0x04 #define T5FW_VERSION_BUILD 0x00 #define T5FW_MIN_VERSION_MAJOR 0x00 @@ -54,8 +54,8 @@ #define T5FW_MIN_VERSION_MICRO 0x00 #define T6FW_VERSION_MAJOR 0x01 -#define T6FW_VERSION_MINOR 0x0D -#define T6FW_VERSION_MICRO 0x2D +#define T6FW_VERSION_MINOR 0x0E +#define T6FW_VERSION_MICRO 0x04 #define T6FW_VERSION_BUILD 0x00 #define T6FW_MIN_VERSION_MAJOR 0x00 From a66e36568e30ed3714c0e3a12bd3b64696343ff5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 10 Sep 2015 01:20:46 +0200 Subject: [PATCH 58/65] netlink, mmap: don't walk rx ring on poll if receive queue non-empty In case of netlink mmap, there can be situations where received frames have to be placed into the normal receive queue. The ring buffer indicates this through NL_MMAP_STATUS_COPY, so the user is asked to pick them up via recvmsg(2) syscall, and to put the slot back to NL_MMAP_STATUS_UNUSED. Commit 0ef707700f1c ("netlink: rx mmap: fix POLLIN condition") changed polling, so that we walk in the worst case the whole ring through the new netlink_has_valid_frame(), for example, when the ring would have no NL_MMAP_STATUS_VALID, but at least one NL_MMAP_STATUS_COPY frame. Since we do a datagram_poll() already earlier to pick up a mask that could possibly contain POLLIN | POLLRDNORM already (due to NL_MMAP_STATUS_COPY), we can skip checking the rx ring entirely. In case the kernel is compiled with !CONFIG_NETLINK_MMAP, then all this is irrelevant anyway as netlink_poll() is just defined as datagram_poll(). Signed-off-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- net/netlink/af_netlink.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 50889be1517d..173817a5dfad 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -674,12 +674,19 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock, mask = datagram_poll(file, sock, wait); - spin_lock_bh(&sk->sk_receive_queue.lock); - if (nlk->rx_ring.pg_vec) { - if (netlink_has_valid_frame(&nlk->rx_ring)) - mask |= POLLIN | POLLRDNORM; + /* We could already have received frames in the normal receive + * queue, that will show up as NL_MMAP_STATUS_COPY in the ring, + * so if mask contains pollin/etc already, there's no point + * walking the ring. + */ + if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) { + spin_lock_bh(&sk->sk_receive_queue.lock); + if (nlk->rx_ring.pg_vec) { + if (netlink_has_valid_frame(&nlk->rx_ring)) + mask |= POLLIN | POLLRDNORM; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); } - spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (nlk->tx_ring.pg_vec) { From 6bb0fef489f667cf701853054f44579754f00a06 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 10 Sep 2015 02:10:57 +0200 Subject: [PATCH 59/65] netlink, mmap: fix edge-case leakages in nf queue zero-copy When netlink mmap on receive side is the consumer of nf queue data, it can happen that in some edge cases, we write skb shared info into the user space mmap buffer: Assume a possible rx ring frame size of only 4096, and the network skb, which is being zero-copied into the netlink skb, contains page frags with an overall skb->len larger than the linear part of the netlink skb. 
skb_zerocopy(), which is generic and thus not aware of the fact that shared info cannot be accessed for such skbs then tries to write and fill frags, thus leaking kernel data/pointers and in some corner cases possibly writing out of bounds of the mmap area (when filling the last slot in the ring buffer this way). I.e. the ring buffer slot is then of status NL_MMAP_STATUS_VALID, has an advertised length larger than 4096, where the linear part is visible at the slot beginning, and the leaked sizeof(struct skb_shared_info) has been written to the beginning of the next slot (also corrupting the struct nl_mmap_hdr slot header incl. status etc), since skb->end points to skb->data + ring->frame_size - NL_MMAP_HDRLEN. The fix adds and lets __netlink_alloc_skb() take the actual needed linear room for the network skb + meta data into account. It's completely irrelevant for non-mmaped netlink sockets, but in case mmap sockets are used, it can be decided whether the available skb_tailroom() is really large enough for the buffer, or whether it needs to internally fall back to a normal alloc_skb(). From nf queue side, the information whether the destination port is an mmap RX ring is not really available without extra port-to-socket lookup, thus it can only be determined in lower layers i.e. when __netlink_alloc_skb() is called that checks internally for this. I chose to add the extra ldiff parameter as mmap will then still work: We have data_len and hlen in nfqnl_build_packet_message(), data_len is the full length (capped at queue->copy_range) for skb_zerocopy() and hlen some possible part of data_len that needs to be copied; the rem_len variable indicates the needed remaining linear mmap space. The only other workaround in nf queue internally would be after allocation time by f.e.
cap'ing the data_len to the skb_tailroom() iff we deal with an mmap skb, but that would 1) expose the fact that we use a mmap skb to upper layers, and 2) trim the skb where we otherwise could just have moved the full skb into the normal receive queue. After the patch, in my test case the ring slot doesn't fit and therefore shows NL_MMAP_STATUS_COPY, where a full skb carries all the data and thus needs to be picked up via recv(). Fixes: 3ab1f683bf8b ("nfnetlink: add support for memory mapped netlink") Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/netlink.h | 13 +++++++++++-- net/netfilter/nfnetlink_queue_core.c | 5 +++-- net/netlink/af_netlink.c | 18 ++++++++++++------ 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 9120edb650a0..639e9b8b0e4d 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -68,8 +68,17 @@ extern int netlink_change_ngroups(struct sock *sk, unsigned int groups); extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group); extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err); extern int netlink_has_listeners(struct sock *sk, unsigned int group); -extern struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, - u32 dst_portid, gfp_t gfp_mask); + +extern struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size, + unsigned int ldiff, u32 dst_portid, + gfp_t gfp_mask); +static inline struct sk_buff * +netlink_alloc_skb(struct sock *ssk, unsigned int size, u32 dst_portid, + gfp_t gfp_mask) +{ + return __netlink_alloc_skb(ssk, size, 0, dst_portid, gfp_mask); +} + extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c 
index 685cc6a17163..a5cd6d90b78b 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -301,7 +301,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, __be32 **packet_id_ptr) { size_t size; - size_t data_len = 0, cap_len = 0; + size_t data_len = 0, cap_len = 0, rem_len = 0; unsigned int hlen = 0; struct sk_buff *skb; struct nlattr *nla; @@ -360,6 +360,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, hlen = min_t(unsigned int, hlen, data_len); size += sizeof(struct nlattr) + hlen; cap_len = entskb->len; + rem_len = data_len - hlen; break; } @@ -377,7 +378,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, size += nla_total_size(seclen); } - skb = nfnetlink_alloc_skb(net, size, queue->peer_portid, + skb = __netlink_alloc_skb(net->nfnl, size, rem_len, queue->peer_portid, GFP_ATOMIC); if (!skb) { skb_tx_error(entskb); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 173817a5dfad..7f86d3b55060 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1844,15 +1844,16 @@ retry: } EXPORT_SYMBOL(netlink_unicast); -struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, - u32 dst_portid, gfp_t gfp_mask) +struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size, + unsigned int ldiff, u32 dst_portid, + gfp_t gfp_mask) { #ifdef CONFIG_NETLINK_MMAP + unsigned int maxlen, linear_size; struct sock *sk = NULL; struct sk_buff *skb; struct netlink_ring *ring; struct nl_mmap_hdr *hdr; - unsigned int maxlen; sk = netlink_getsockbyportid(ssk, dst_portid); if (IS_ERR(sk)) @@ -1863,7 +1864,11 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, if (ring->pg_vec == NULL) goto out_put; - if (ring->frame_size - NL_MMAP_HDRLEN < size) + /* We need to account the full linear size needed as a ring + * slot cannot have non-linear parts. 
+ */ + linear_size = size + ldiff; + if (ring->frame_size - NL_MMAP_HDRLEN < linear_size) goto out_put; skb = alloc_skb_head(gfp_mask); @@ -1877,13 +1882,14 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, /* check again under lock */ maxlen = ring->frame_size - NL_MMAP_HDRLEN; - if (maxlen < size) + if (maxlen < linear_size) goto out_free; netlink_forward_ring(ring); hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); if (hdr == NULL) goto err2; + netlink_ring_setup_skb(skb, sk, ring, hdr); netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); atomic_inc(&ring->pending); @@ -1909,7 +1915,7 @@ out: #endif return alloc_skb(size, gfp_mask); } -EXPORT_SYMBOL_GPL(netlink_alloc_skb); +EXPORT_SYMBOL_GPL(__netlink_alloc_skb); int netlink_has_listeners(struct sock *sk, unsigned int group) { From 1ab1e895492d8084dfc1c854efacde219e56b8c1 Mon Sep 17 00:00:00 2001 From: Henrik Austad Date: Wed, 9 Sep 2015 12:25:17 +0200 Subject: [PATCH 60/65] ether: add IEEE 1722 ethertype - TSN IEEE 1722 describes AVB (later renamed to TSN - Time Sensitive Networking), a protocol, encapsulation and synchronization to utilize standard networks for audio/video (and later other time-sensitive) streams. This standard uses ethertype 0x22F0. http://standards.ieee.org/develop/regauth/ethertype/eth.txt This is a respin of a previous patch ("ether: add AVB frame type ETH_P_AVB") CC: "David S. Miller" CC: netdev@vger.kernel.org CC: linux-api@vger.kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Henrik Austad Signed-off-by: David S.
Miller --- include/uapi/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index aa63ed023c2b..ea9221b0331a 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -42,6 +42,7 @@ #define ETH_P_LOOP 0x0060 /* Ethernet Loopback packet */ #define ETH_P_PUP 0x0200 /* Xerox PUP packet */ #define ETH_P_PUPAT 0x0201 /* Xerox PUP Addr Trans packet */ +#define ETH_P_TSN 0x22F0 /* TSN (IEEE 1722) packet */ #define ETH_P_IP 0x0800 /* Internet Protocol packet */ #define ETH_P_X25 0x0805 /* CCITT X.25 */ #define ETH_P_ARP 0x0806 /* Address Resolution packet */ From 420203204eada39cfe0e8eb65e609da7b209cf33 Mon Sep 17 00:00:00 2001 From: Corinna Vinschen Date: Thu, 10 Sep 2015 10:47:35 +0200 Subject: [PATCH 61/65] r8169: Fix sleeping function called during get_stats64, v2 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=104031 Fixes: 6e85d5ad36a26debc23a9a865c029cbe242b2dc8 Based on the discussion starting at http://www.spinics.net/lists/netdev/msg342193.html Tested locally on RTL8168evl/8111evl with various concurrent processes accessing /proc/net/dev while changing the link state as well as removing/reloading the r8169 module. Signed-off-by: Corinna Vinschen Tested-by: poma Signed-off-by: David S. 
Miller --- drivers/net/ethernet/realtek/r8169.c | 137 +++++++++++---------------- 1 file changed, 54 insertions(+), 83 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 24dcbe62412a..2b32e0c5a0b4 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -833,7 +833,8 @@ struct rtl8169_private { unsigned features; struct mii_if_info mii; - struct rtl8169_counters counters; + dma_addr_t counters_phys_addr; + struct rtl8169_counters *counters; struct rtl8169_tc_offsets tc_offset; u32 saved_wolopts; u32 opts1_mask; @@ -2190,53 +2191,37 @@ static int rtl8169_get_sset_count(struct net_device *dev, int sset) } } -static struct rtl8169_counters *rtl8169_map_counters(struct net_device *dev, - dma_addr_t *paddr, - u32 counter_cmd) +DECLARE_RTL_COND(rtl_counters_cond) { - struct rtl8169_private *tp = netdev_priv(dev); void __iomem *ioaddr = tp->mmio_addr; - struct device *d = &tp->pci_dev->dev; - struct rtl8169_counters *counters; - u32 cmd; - counters = dma_alloc_coherent(d, sizeof(*counters), paddr, GFP_KERNEL); - if (counters) { - RTL_W32(CounterAddrHigh, (u64)*paddr >> 32); - cmd = (u64)*paddr & DMA_BIT_MASK(32); - RTL_W32(CounterAddrLow, cmd); - RTL_W32(CounterAddrLow, cmd | counter_cmd); - } - return counters; + return RTL_R32(CounterAddrLow) & (CounterReset | CounterDump); } -static void rtl8169_unmap_counters (struct net_device *dev, - dma_addr_t paddr, - struct rtl8169_counters *counters) +static bool rtl8169_do_counters(struct net_device *dev, u32 counter_cmd) { struct rtl8169_private *tp = netdev_priv(dev); void __iomem *ioaddr = tp->mmio_addr; - struct device *d = &tp->pci_dev->dev; + dma_addr_t paddr = tp->counters_phys_addr; + u32 cmd; + bool ret; + + RTL_W32(CounterAddrHigh, (u64)paddr >> 32); + cmd = (u64)paddr & DMA_BIT_MASK(32); + RTL_W32(CounterAddrLow, cmd); + RTL_W32(CounterAddrLow, cmd | counter_cmd); + + ret = rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 
10, 1000); RTL_W32(CounterAddrLow, 0); RTL_W32(CounterAddrHigh, 0); - dma_free_coherent(d, sizeof(*counters), counters, paddr); -} - -DECLARE_RTL_COND(rtl_reset_counters_cond) -{ - void __iomem *ioaddr = tp->mmio_addr; - - return RTL_R32(CounterAddrLow) & CounterReset; + return ret; } static bool rtl8169_reset_counters(struct net_device *dev) { struct rtl8169_private *tp = netdev_priv(dev); - struct rtl8169_counters *counters; - dma_addr_t paddr; - bool ret = true; /* * Versions prior to RTL_GIGA_MAC_VER_19 don't support resetting the @@ -2245,32 +2230,13 @@ static bool rtl8169_reset_counters(struct net_device *dev) if (tp->mac_version < RTL_GIGA_MAC_VER_19) return true; - counters = rtl8169_map_counters(dev, &paddr, CounterReset); - if (!counters) - return false; - - if (!rtl_udelay_loop_wait_low(tp, &rtl_reset_counters_cond, 10, 1000)) - ret = false; - - rtl8169_unmap_counters(dev, paddr, counters); - - return ret; -} - -DECLARE_RTL_COND(rtl_counters_cond) -{ - void __iomem *ioaddr = tp->mmio_addr; - - return RTL_R32(CounterAddrLow) & CounterDump; + return rtl8169_do_counters(dev, CounterReset); } static bool rtl8169_update_counters(struct net_device *dev) { struct rtl8169_private *tp = netdev_priv(dev); void __iomem *ioaddr = tp->mmio_addr; - struct rtl8169_counters *counters; - dma_addr_t paddr; - bool ret = true; /* * Some chips are unable to dump tally counters when the receiver @@ -2279,23 +2245,13 @@ static bool rtl8169_update_counters(struct net_device *dev) if ((RTL_R8(ChipCmd) & CmdRxEnb) == 0) return true; - counters = rtl8169_map_counters(dev, &paddr, CounterDump); - if (!counters) - return false; - - if (rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000)) - memcpy(&tp->counters, counters, sizeof(*counters)); - else - ret = false; - - rtl8169_unmap_counters(dev, paddr, counters); - - return ret; + return rtl8169_do_counters(dev, CounterDump); } static bool rtl8169_init_counter_offsets(struct net_device *dev) { struct rtl8169_private *tp = 
netdev_priv(dev); + struct rtl8169_counters *counters = tp->counters; bool ret = false; /* @@ -2323,9 +2279,9 @@ static bool rtl8169_init_counter_offsets(struct net_device *dev) if (rtl8169_update_counters(dev)) ret = true; - tp->tc_offset.tx_errors = tp->counters.tx_errors; - tp->tc_offset.tx_multi_collision = tp->counters.tx_multi_collision; - tp->tc_offset.tx_aborted = tp->counters.tx_aborted; + tp->tc_offset.tx_errors = counters->tx_errors; + tp->tc_offset.tx_multi_collision = counters->tx_multi_collision; + tp->tc_offset.tx_aborted = counters->tx_aborted; tp->tc_offset.inited = true; return ret; @@ -2335,24 +2291,25 @@ static void rtl8169_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct rtl8169_private *tp = netdev_priv(dev); + struct rtl8169_counters *counters = tp->counters; ASSERT_RTNL(); rtl8169_update_counters(dev); - data[0] = le64_to_cpu(tp->counters.tx_packets); - data[1] = le64_to_cpu(tp->counters.rx_packets); - data[2] = le64_to_cpu(tp->counters.tx_errors); - data[3] = le32_to_cpu(tp->counters.rx_errors); - data[4] = le16_to_cpu(tp->counters.rx_missed); - data[5] = le16_to_cpu(tp->counters.align_errors); - data[6] = le32_to_cpu(tp->counters.tx_one_collision); - data[7] = le32_to_cpu(tp->counters.tx_multi_collision); - data[8] = le64_to_cpu(tp->counters.rx_unicast); - data[9] = le64_to_cpu(tp->counters.rx_broadcast); - data[10] = le32_to_cpu(tp->counters.rx_multicast); - data[11] = le16_to_cpu(tp->counters.tx_aborted); - data[12] = le16_to_cpu(tp->counters.tx_underun); + data[0] = le64_to_cpu(counters->tx_packets); + data[1] = le64_to_cpu(counters->rx_packets); + data[2] = le64_to_cpu(counters->tx_errors); + data[3] = le32_to_cpu(counters->rx_errors); + data[4] = le16_to_cpu(counters->rx_missed); + data[5] = le16_to_cpu(counters->align_errors); + data[6] = le32_to_cpu(counters->tx_one_collision); + data[7] = le32_to_cpu(counters->tx_multi_collision); + data[8] = le64_to_cpu(counters->rx_unicast); + data[9] = 
le64_to_cpu(counters->rx_broadcast); + data[10] = le32_to_cpu(counters->rx_multicast); + data[11] = le16_to_cpu(counters->tx_aborted); + data[12] = le16_to_cpu(counters->tx_underun); } static void rtl8169_get_strings(struct net_device *dev, u32 stringset, u8 *data) @@ -7780,6 +7737,7 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct rtl8169_private *tp = netdev_priv(dev); void __iomem *ioaddr = tp->mmio_addr; + struct rtl8169_counters *counters = tp->counters; unsigned int start; if (netif_running(dev)) @@ -7816,11 +7774,11 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) * Subtract values fetched during initalization. * See rtl8169_init_counter_offsets for a description why we do that. */ - stats->tx_errors = le64_to_cpu(tp->counters.tx_errors) - + stats->tx_errors = le64_to_cpu(counters->tx_errors) - le64_to_cpu(tp->tc_offset.tx_errors); - stats->collisions = le32_to_cpu(tp->counters.tx_multi_collision) - + stats->collisions = le32_to_cpu(counters->tx_multi_collision) - le32_to_cpu(tp->tc_offset.tx_multi_collision); - stats->tx_aborted_errors = le16_to_cpu(tp->counters.tx_aborted) - + stats->tx_aborted_errors = le16_to_cpu(counters->tx_aborted) - le16_to_cpu(tp->tc_offset.tx_aborted); return stats; @@ -8022,6 +7980,9 @@ static void rtl_remove_one(struct pci_dev *pdev) unregister_netdev(dev); + dma_free_coherent(&tp->pci_dev->dev, sizeof(*tp->counters), + tp->counters, tp->counters_phys_addr); + rtl_release_firmware(tp); if (pci_dev_run_wake(pdev)) @@ -8447,9 +8408,16 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) tp->rtl_fw = RTL_FIRMWARE_UNKNOWN; + tp->counters = dma_alloc_coherent (&pdev->dev, sizeof(*tp->counters), + &tp->counters_phys_addr, GFP_KERNEL); + if (!tp->counters) { + rc = -ENOMEM; + goto err_out_msi_4; + } + rc = register_netdev(dev); if (rc < 0) - goto err_out_msi_4; + goto err_out_cnt_5; pci_set_drvdata(pdev, dev); @@ -8483,6 +8451,9 @@ static 
int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) out: return rc; +err_out_cnt_5: + dma_free_coherent(&pdev->dev, sizeof(*tp->counters), tp->counters, + tp->counters_phys_addr); err_out_msi_4: netif_napi_del(&tp->napi); rtl_disable_msi(pdev, tp); From 4c82ac3c37363e8c4ded6a5fe1ec5fa756b34df3 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Thu, 10 Sep 2015 11:18:57 +0100 Subject: [PATCH 62/65] xen-netback: respect user provided max_queues Originally that parameter was always reset to num_online_cpus during module initialisation, which renders it useless. The fix is to only set max_queues to num_online_cpus when user has not provided a value. Reported-by: Johnny Strom Signed-off-by: Wei Liu Reviewed-by: David Vrabel Acked-by: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index b588b1a08cd4..abc1381264fc 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -2114,8 +2114,11 @@ static int __init netback_init(void) if (!xen_domain()) return -ENODEV; - /* Allow as many queues as there are CPUs, by default */ - xenvif_max_queues = num_online_cpus(); + /* Allow as many queues as there are CPUs if user has not + * specified a value. + */ + if (xenvif_max_queues == 0) + xenvif_max_queues = num_online_cpus(); if (fatal_skb_slots < XEN_NETBK_LEGACY_SLOTS_MAX) { pr_info("fatal_skb_slots too small (%d), bump it to XEN_NETBK_LEGACY_SLOTS_MAX (%d)\n", From 32a844056fd43dda647e1c3c6b9983bdfa04d17d Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Thu, 10 Sep 2015 11:18:58 +0100 Subject: [PATCH 63/65] xen-netfront: respect user provided max_queues Originally that parameter was always reset to num_online_cpus during module initialisation, which renders it useless. The fix is to only set max_queues to num_online_cpus when user has not provided a value. 
Signed-off-by: Wei Liu Cc: David Vrabel Reviewed-by: David Vrabel Tested-by: David Vrabel Signed-off-by: David S. Miller --- drivers/net/xen-netfront.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index e27e6d2ea6d2..b9c637a0036b 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -2132,8 +2132,11 @@ static int __init netif_init(void) pr_info("Initialising Xen virtual ethernet driver\n"); - /* Allow as many queues as there are CPUs, by default */ - xennet_max_queues = num_online_cpus(); + /* Allow as many queues as there are CPUs if user has not + * specified a value. + */ + if (xennet_max_queues == 0) + xennet_max_queues = num_online_cpus(); return xenbus_register_frontend(&netfront_driver); } From 05c5a46d71f621df620fbabbd7758ee1b44575ad Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 9 Sep 2015 21:54:37 -0700 Subject: [PATCH 64/65] tcp: generate CA_EVENT_TX_START on data frames Issuing a CC TX_START event on control frames like pure ACK is a waste of time, as a CC should not care. Following patch needs this change, as we want CUBIC to properly track idle time at a low cost, with a single TX_START being generated. Yuchung might slightly refine the condition triggering TX_START on a followup patch. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Cc: Jana Iyengar Cc: Stephen Hemminger Cc: Sangtae Ha Cc: Lawrence Brakmo Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1188e4fcf23b..f9a8a12b62ee 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -164,6 +164,9 @@ static void tcp_event_data_sent(struct tcp_sock *tp, struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; + if (tcp_packets_in_flight(tp) == 0) + tcp_ca_event(sk, CA_EVENT_TX_START); + tp->lsndtime = now; /* If it is a reply for ato after last received @@ -940,9 +943,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, &md5); tcp_header_size = tcp_options_size + sizeof(struct tcphdr); - if (tcp_packets_in_flight(tp) == 0) - tcp_ca_event(sk, CA_EVENT_TX_START); - /* if no packet is in qdisc/device queue, then allow XPS to select * another queue. We can be called from tcp_tsq_handler() * which holds one reference to sk_wmem_alloc. From 30927520dbae297182990bb21d08762bcc35ce1d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Sep 2015 21:55:07 -0700 Subject: [PATCH 65/65] tcp_cubic: better follow cubic curve after idle period Jana Iyengar found an interesting issue on CUBIC : The epoch is only updated/reset initially and when experiencing losses. The delta "t" of now - epoch_start can be arbitrary large after app idle as well as the bic_target. Consequentially the slope (inverse of ca->cnt) would be really large, and eventually ca->cnt would be lower-bounded in the end to 2 to have delayed-ACK slow-start behavior. This particularly shows up when slow_start_after_idle is disabled as a dangerous cwnd inflation (1.5 x RTT) after few seconds of idle time. Jana initial fix was to reset epoch_start if app limited, but Neal pointed out it would ask the CUBIC algorithm to recalculate the curve so that we again start growing steeply upward from where cwnd is now (as CUBIC does just after a loss). 
Ideally we'd want the cwnd growth curve to be the same shape, just shifted later in time by the amount of the idle period. Reported-by: Jana Iyengar Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Cc: Stephen Hemminger Cc: Sangtae Ha Cc: Lawrence Brakmo Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 28011fb1f4a2..c6ded6b2a79f 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -151,6 +151,21 @@ static void bictcp_init(struct sock *sk) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } +static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_TX_START) { + s32 delta = tcp_time_stamp - tcp_sk(sk)->lsndtime; + struct bictcp *ca = inet_csk_ca(sk); + + /* We were application limited (idle) for a while. + * Shift epoch_start to keep cwnd growth to cubic curve. + */ + if (ca->epoch_start && delta > 0) + ca->epoch_start += delta; + return; + } +} + /* calculate the cubic root of x using a table lookup followed by one * Newton-Raphson iteration. * Avg err ~= 0.195% @@ -450,6 +465,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = { .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, .undo_cwnd = bictcp_undo_cwnd, + .cwnd_event = bictcp_cwnd_event, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, .name = "cubic",