From cb2d0f3e968bff7c6d262aca3e3ab8d4184e69b2 Mon Sep 17 00:00:00 2001 From: David Ward Date: Sun, 18 Sep 2011 12:53:20 +0000 Subject: [PATCH 1/6] macvlan/macvtap: Fix unicast between macvtap interfaces in bridge mode Packets should always be forwarded to the lowerdev using dev_forward_skb. vlan->forward is for packets being forwarded directly to another macvlan/ macvtap device (used for multicast in bridge mode). Reported-and-tested-by: Shlomo Pongratz Signed-off-by: David Ward Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 05172c39a0ce..376e3e94bae0 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -239,7 +239,7 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) dest = macvlan_hash_lookup(port, eth->h_dest); if (dest && dest->mode == MACVLAN_MODE_BRIDGE) { /* send to lowerdev first for its network taps */ - vlan->forward(vlan->lowerdev, skb); + dev_forward_skb(vlan->lowerdev, skb); return NET_XMIT_SUCCESS; } From 260fcbeb1ae9e768a44c9925338fbacb0d7e5ba9 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 29 Sep 2011 17:10:10 +0000 Subject: [PATCH 2/6] tcp: properly handle md5sig_pool references tcp_v4_clear_md5_list() assumes that multiple tcp md5sig peers only hold one reference to md5sig_pool. but tcp_v4_md5_do_add() increases use count of md5sig_pool for each peer. This patch makes tcp_v4_md5_do_add() only increases use count for the first tcp md5sig peer. Signed-off-by: Zheng Yan Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 11 +++++++---- net/ipv6/tcp_ipv6.c | 8 +++++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c34f01513945..7963e03f1068 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -927,18 +927,21 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr, } sk_nocaps_add(sk, NETIF_F_GSO_MASK); } - if (tcp_alloc_md5sig_pool(sk) == NULL) { + + md5sig = tp->md5sig_info; + if (md5sig->entries4 == 0 && + tcp_alloc_md5sig_pool(sk) == NULL) { kfree(newkey); return -ENOMEM; } - md5sig = tp->md5sig_info; if (md5sig->alloced4 == md5sig->entries4) { keys = kmalloc((sizeof(*keys) * (md5sig->entries4 + 1)), GFP_ATOMIC); if (!keys) { kfree(newkey); - tcp_free_md5sig_pool(); + if (md5sig->entries4 == 0) + tcp_free_md5sig_pool(); return -ENOMEM; } @@ -982,6 +985,7 @@ int tcp_v4_md5_do_del(struct sock *sk, __be32 addr) kfree(tp->md5sig_info->keys4); tp->md5sig_info->keys4 = NULL; tp->md5sig_info->alloced4 = 0; + tcp_free_md5sig_pool(); } else if (tp->md5sig_info->entries4 != i) { /* Need to do some manipulation */ memmove(&tp->md5sig_info->keys4[i], @@ -989,7 +993,6 @@ int tcp_v4_md5_do_del(struct sock *sk, __be32 addr) (tp->md5sig_info->entries4 - i) * sizeof(struct tcp4_md5sig_key)); } - tcp_free_md5sig_pool(); return 0; } } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 79cc6469508d..7b8fc5794352 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -591,7 +591,8 @@ static int tcp_v6_md5_do_add(struct sock *sk, const struct in6_addr *peer, } sk_nocaps_add(sk, NETIF_F_GSO_MASK); } - if (tcp_alloc_md5sig_pool(sk) == NULL) { + if (tp->md5sig_info->entries6 == 0 && + tcp_alloc_md5sig_pool(sk) == NULL) { kfree(newkey); return -ENOMEM; } @@ -600,8 +601,9 @@ static int tcp_v6_md5_do_add(struct sock *sk, const struct in6_addr *peer, (tp->md5sig_info->entries6 + 1)), GFP_ATOMIC); if (!keys) { - tcp_free_md5sig_pool(); kfree(newkey); + if (tp->md5sig_info->entries6 == 0) + tcp_free_md5sig_pool(); return -ENOMEM; } @@ -647,6 +649,7 @@ static int tcp_v6_md5_do_del(struct sock *sk, const struct in6_addr *peer) kfree(tp->md5sig_info->keys6); tp->md5sig_info->keys6 = NULL; tp->md5sig_info->alloced6 = 0; + tcp_free_md5sig_pool(); } else { /* shrink the database */ if (tp->md5sig_info->entries6 != i) @@ -655,7 +658,6 @@ static int tcp_v6_md5_do_del(struct sock *sk, const struct in6_addr *peer) (tp->md5sig_info->entries6 - i) * sizeof (tp->md5sig_info->keys6[0])); } - tcp_free_md5sig_pool(); return 0; } } From 1e5289e121372a3494402b1b131b41bfe1cf9b7f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 2 Oct 2011 04:21:50 +0000 Subject: [PATCH 3/6] tcp: properly update lost_cnt_hint during shifting lost_skb_hint is used by tcp_mark_head_lost() to mark the first unhandled skb. lost_cnt_hint is the number of packets or sacked packets before the lost_skb_hint; When shifting a skb that is before the lost_skb_hint, if tcp_is_fack() is ture, the skb has already been counted in the lost_cnt_hint; if tcp_is_fack() is false, tcp_sacktag_one() will increase the lost_cnt_hint. So tcp_shifted_skb() does not need to adjust the lost_cnt_hint by itself. When shifting a skb that is equal to lost_skb_hint, the shifted packets will not be counted by tcp_mark_head_lost(). So tcp_shifted_skb() should adjust the lost_cnt_hint even tcp_is_fack(tp) is true. Signed-off-by: Zheng Yan Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 21fab3edb92c..d73aab3fbfc0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1389,9 +1389,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, BUG_ON(!pcount); - /* Tweak before seqno plays */ - if (!tcp_is_fack(tp) && tcp_is_sack(tp) && tp->lost_skb_hint && - !before(TCP_SKB_CB(tp->lost_skb_hint)->seq, TCP_SKB_CB(skb)->seq)) + if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; TCP_SKB_CB(prev)->end_seq += shifted; From 3458e21c0d384ca04b27a2ea24d9314c1b57530f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 5 Oct 2011 03:24:43 +0000 Subject: [PATCH 4/6] netfilter: Use proper rwlock init function Replace the open coded initialization with the init function. Signed-off-by: Thomas Gleixner Acked-by: Hans Schillstrom Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_ctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 2b771dc708a3..5290ac353a5e 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3679,7 +3679,7 @@ int __net_init ip_vs_control_net_init(struct net *net) int idx; struct netns_ipvs *ipvs = net_ipvs(net); - ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock); + rwlock_init(&ipvs->rs_lock); /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) From b64b73d7d0c480f75684519c6134e79d50c1b341 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 3 Oct 2011 18:14:45 +0000 Subject: [PATCH 5/6] bridge: leave carrier on for empty bridge This resolves a regression seen by some users of bridging. Some users use the bridge like a dummy device. They expect to be able to put an IPv6 address on the device with no ports attached. Although there are better ways of doing this, there is no reason to not allow it. Note: the bridge still will reflect the state of ports in the bridge if there are any added. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_device.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 32b8f9f7f79e..ff3ed6086ce1 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -91,7 +91,6 @@ static int br_dev_open(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); - netif_carrier_off(dev); netdev_update_features(dev); netif_start_queue(dev); br_stp_enable_bridge(br); @@ -108,8 +107,6 @@ static int br_dev_stop(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); - netif_carrier_off(dev); - br_stp_disable_bridge(br); br_multicast_stop(br); From 186c6bbced722cfeff041d2a1264c95f5d042050 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Tue, 4 Oct 2011 04:00:30 +0000 Subject: [PATCH 6/6] net: fix typos in Documentation/networking/scaling.txt The second hunk fixes rps_sock_flow_table but has to re-wrap the paragraph. Signed-off-by: Benjamin Poirier Signed-off-by: David S. Miller --- Documentation/networking/scaling.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/networking/scaling.txt b/Documentation/networking/scaling.txt index 8ce7c30e7230..fe67b5c79f0f 100644 --- a/Documentation/networking/scaling.txt +++ b/Documentation/networking/scaling.txt @@ -27,7 +27,7 @@ applying a filter to each packet that assigns it to one of a small number of logical flows. Packets for each flow are steered to a separate receive queue, which in turn can be processed by separate CPUs. This mechanism is generally known as “Receive-side Scaling” (RSS). The goal of RSS and -the other scaling techniques to increase performance uniformly. +the other scaling techniques is to increase performance uniformly. Multi-queue distribution can also be used for traffic prioritization, but that is not the focus of these techniques. @@ -186,10 +186,10 @@ are steered using plain RPS. Multiple table entries may point to the same CPU. Indeed, with many flows and few CPUs, it is very likely that a single application thread handles flows with many different flow hashes. -rps_sock_table is a global flow table that contains the *desired* CPU for -flows: the CPU that is currently processing the flow in userspace. Each -table value is a CPU index that is updated during calls to recvmsg and -sendmsg (specifically, inet_recvmsg(), inet_sendmsg(), inet_sendpage() +rps_sock_flow_table is a global flow table that contains the *desired* CPU +for flows: the CPU that is currently processing the flow in userspace. +Each table value is a CPU index that is updated during calls to recvmsg +and sendmsg (specifically, inet_recvmsg(), inet_sendmsg(), inet_sendpage() and tcp_splice_read()). When the scheduler moves a thread to a new CPU while it has outstanding