From 2bd624b4611ffee36422782d16e1c944d1351e98 Mon Sep 17 00:00:00 2001
From: Anoob Soman <anoob.soman@citrix.com>
Date: Wed, 15 Feb 2017 20:25:39 +0000
Subject: [PATCH 1/5] packet: Do not call fanout_release from atomic contexts

Commit 6664498280cf ("packet: call fanout_release, while UNREGISTERING a
netdev"), unfortunately, introduced the following issues.

1. calling mutex_lock(&fanout_mutex) (fanout_release()) from inside
rcu_read-side critical section. rcu_read_lock disables preemption, most often,
which prohibits calling sleeping functions.

[  ] include/linux/rcupdate.h:560 Illegal context switch in RCU read-side critical section!
[  ]
[  ] rcu_scheduler_active = 1, debug_locks = 0
[  ] 4 locks held by ovs-vswitchd/1969:
[  ]  #0:  (cb_lock){++++++}, at: [<ffffffff8158a6c9>] genl_rcv+0x19/0x40
[  ]  #1:  (ovs_mutex){+.+.+.}, at: [<ffffffffa04878ca>] ovs_vport_cmd_del+0x4a/0x100 [openvswitch]
[  ]  #2:  (rtnl_mutex){+.+.+.}, at: [<ffffffff81564157>] rtnl_lock+0x17/0x20
[  ]  #3:  (rcu_read_lock){......}, at: [<ffffffff81614165>] packet_notifier+0x5/0x3f0
[  ]
[  ] Call Trace:
[  ]  [<ffffffff813770c1>] dump_stack+0x85/0xc4
[  ]  [<ffffffff810c9077>] lockdep_rcu_suspicious+0x107/0x110
[  ]  [<ffffffff810a2da7>] ___might_sleep+0x57/0x210
[  ]  [<ffffffff810a2fd0>] __might_sleep+0x70/0x90
[  ]  [<ffffffff8162e80c>] mutex_lock_nested+0x3c/0x3a0
[  ]  [<ffffffff810de93f>] ? vprintk_default+0x1f/0x30
[  ]  [<ffffffff81186e88>] ? printk+0x4d/0x4f
[  ]  [<ffffffff816106dd>] fanout_release+0x1d/0xe0
[  ]  [<ffffffff81614459>] packet_notifier+0x2f9/0x3f0

2. calling mutex_lock(&fanout_mutex) inside spin_lock(&po->bind_lock).
"sleeping function called from invalid context"

[  ] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:620
[  ] in_atomic(): 1, irqs_disabled(): 0, pid: 1969, name: ovs-vswitchd
[  ] INFO: lockdep is turned off.
[  ] Call Trace:
[  ]  [<ffffffff813770c1>] dump_stack+0x85/0xc4
[  ]  [<ffffffff810a2f52>] ___might_sleep+0x202/0x210
[  ]  [<ffffffff810a2fd0>] __might_sleep+0x70/0x90
[  ]  [<ffffffff8162e80c>] mutex_lock_nested+0x3c/0x3a0
[  ]  [<ffffffff816106dd>] fanout_release+0x1d/0xe0
[  ]  [<ffffffff81614459>] packet_notifier+0x2f9/0x3f0

3. calling dev_remove_pack(&fanout->prot_hook), from inside
spin_lock(&po->bind_lock) or rcu_read-side critical-section. dev_remove_pack()
-> synchronize_net(), which might sleep.

[  ] BUG: scheduling while atomic: ovs-vswitchd/1969/0x00000002
[  ] INFO: lockdep is turned off.
[  ] Call Trace:
[  ]  [<ffffffff813770c1>] dump_stack+0x85/0xc4
[  ]  [<ffffffff81186274>] __schedule_bug+0x64/0x73
[  ]  [<ffffffff8162b8cb>] __schedule+0x6b/0xd10
[  ]  [<ffffffff8162c5db>] schedule+0x6b/0x80
[  ]  [<ffffffff81630b1d>] schedule_timeout+0x38d/0x410
[  ]  [<ffffffff810ea3fd>] synchronize_sched_expedited+0x53d/0x810
[  ]  [<ffffffff810ea6de>] synchronize_rcu_expedited+0xe/0x10
[  ]  [<ffffffff8154eab5>] synchronize_net+0x35/0x50
[  ]  [<ffffffff8154eae3>] dev_remove_pack+0x13/0x20
[  ]  [<ffffffff8161077e>] fanout_release+0xbe/0xe0
[  ]  [<ffffffff81614459>] packet_notifier+0x2f9/0x3f0

4. fanout_release() races with calls from different CPU.

To fix the above problems, remove the call to fanout_release() under
rcu_read_lock(). Instead, call __dev_remove_pack(&fanout->prot_hook) and
netdev_run_todo will be happy that &dev->ptype_specific list is empty. In order
to achieve this, I moved dev_{add,remove}_pack() out of fanout_{add,release} to
__fanout_{link,unlink}. So, call to {,__}unregister_prot_hook() will make sure
fanout->prot_hook is removed as well.

Fixes: 6664498280cf ("packet: call fanout_release, while UNREGISTERING a netdev")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Anoob Soman <anoob.soman@citrix.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 0f03f6a53b4d..70f5b6a4683c 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1497,6 +1497,8 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po)
 	f->arr[f->num_members] = sk;
 	smp_wmb();
 	f->num_members++;
+	if (f->num_members == 1)
+		dev_add_pack(&f->prot_hook);
 	spin_unlock(&f->lock);
 }
 
@@ -1513,6 +1515,8 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
 	BUG_ON(i >= f->num_members);
 	f->arr[i] = f->arr[f->num_members - 1];
 	f->num_members--;
+	if (f->num_members == 0)
+		__dev_remove_pack(&f->prot_hook);
 	spin_unlock(&f->lock);
 }
 
@@ -1693,7 +1697,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 		match->prot_hook.func = packet_rcv_fanout;
 		match->prot_hook.af_packet_priv = match;
 		match->prot_hook.id_match = match_fanout_group;
-		dev_add_pack(&match->prot_hook);
 		list_add(&match->list, &fanout_list);
 	}
 	err = -EINVAL;
@@ -1718,7 +1721,12 @@ out:
 	return err;
 }
 
-static void fanout_release(struct sock *sk)
+/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
+ * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
+ * It is the responsibility of the caller to call fanout_release_data() and
+ * free the returned packet_fanout (after synchronize_net())
+ */
+static struct packet_fanout *fanout_release(struct sock *sk)
 {
 	struct packet_sock *po = pkt_sk(sk);
 	struct packet_fanout *f;
@@ -1728,17 +1736,17 @@ static void fanout_release(struct sock *sk)
 	if (f) {
 		po->fanout = NULL;
 
-		if (atomic_dec_and_test(&f->sk_ref)) {
+		if (atomic_dec_and_test(&f->sk_ref))
 			list_del(&f->list);
-			dev_remove_pack(&f->prot_hook);
-			fanout_release_data(f);
-			kfree(f);
-		}
+		else
+			f = NULL;
 
 		if (po->rollover)
 			kfree_rcu(po->rollover, rcu);
 	}
 	mutex_unlock(&fanout_mutex);
+
+	return f;
 }
 
 static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
@@ -2912,6 +2920,7 @@ static int packet_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
 	struct packet_sock *po;
+	struct packet_fanout *f;
 	struct net *net;
 	union tpacket_req_u req_u;
 
@@ -2951,9 +2960,14 @@ static int packet_release(struct socket *sock)
 		packet_set_ring(sk, &req_u, 1, 1);
 	}
 
-	fanout_release(sk);
+	f = fanout_release(sk);
 
 	synchronize_net();
+
+	if (f) {
+		fanout_release_data(f);
+		kfree(f);
+	}
 	/*
 	 *	Now the socket is dead. No more input will appear.
 	 */
@@ -3905,7 +3919,6 @@ static int packet_notifier(struct notifier_block *this,
 				}
 				if (msg == NETDEV_UNREGISTER) {
 					packet_cached_dev_reset(po);
-					fanout_release(sk);
 					po->ifindex = -1;
 					if (po->prot_hook.dev)
 						dev_put(po->prot_hook.dev);

From 785f35775d968e0f45231b754e945fcb3ed6bded Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 16 Feb 2017 12:56:10 +0300
Subject: [PATCH 2/5] dpaa_eth: small leak on error

This should be >= instead of > here.  It means that we don't increment
the free count enough so it becomes off by one.

Fixes: 9ad1a3749333 ("dpaa_eth: add support for DPAA Ethernet")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index c9b7ad65e563..726b5693ae8a 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -1668,7 +1668,7 @@ static struct sk_buff *sg_fd_to_skb(const struct dpaa_priv *priv,
 
 free_buffers:
 	/* compensate sw bpool counter changes */
-	for (i--; i > 0; i--) {
+	for (i--; i >= 0; i--) {
 		dpaa_bp = dpaa_bpid2pool(sgt[i].bpid);
 		if (dpaa_bp) {
 			count_ptr = this_cpu_ptr(dpaa_bp->percpu_count);

From 5edabca9d4cff7f1f2b68f0bac55ef99d9798ba4 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 16 Feb 2017 17:22:46 +0100
Subject: [PATCH 3/5] dccp: fix freeing skb too early for IPV6_RECVPKTINFO

In the current DCCP implementation an skb for a DCCP_PKT_REQUEST packet
is forcibly freed via __kfree_skb in dccp_rcv_state_process if
dccp_v6_conn_request successfully returns.

However, if IPV6_RECVPKTINFO is set on a socket, the address of the skb
is saved to ireq->pktopts and the ref count for skb is incremented in
dccp_v6_conn_request, so skb is still in use. Nevertheless, it gets freed
in dccp_rcv_state_process.

Fix by calling consume_skb instead of doing goto discard and therefore
calling __kfree_skb.

Similar fixes for TCP:

fb7e2399ec17f1004c0e0ccfd17439f8759ede01 [TCP]: skb is unexpectedly freed.
0aea76d35c9651d55bbaf746e7914e5f9ae5a25d tcp: SYN packets are now
simply consumed

Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/input.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/dccp/input.c b/net/dccp/input.c
index ba347184bda9..8fedc2d49770 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -606,7 +606,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
 								    skb) < 0)
 				return 1;
-			goto discard;
+			consume_skb(skb);
+			return 0;
 		}
 		if (dh->dccph_type == DCCP_PKT_RESET)
 			goto discard;

From 22f0708a718daea5e79de2d29b4829de016a4ff4 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 17 Feb 2017 19:14:27 +0100
Subject: [PATCH 4/5] vxlan: fix oops in dev_fill_metadata_dst

Since the commit 0c1d70af924b ("net: use dst_cache for vxlan device")
vxlan_fill_metadata_dst() calls vxlan_get_route() passing a NULL
dst_cache pointer, so the latter should explicitly check for
valid dst_cache ptr. Unfortunately the commit d71785ffc7e7 ("net: add
dst_cache to ovs vxlan lwtunnel") removed said check.

As a result is possible to trigger a null pointer access calling
vxlan_fill_metadata_dst(), e.g. with:

ovs-vsctl add-br ovs-br0
ovs-vsctl add-port ovs-br0 vxlan0 -- set interface vxlan0 \
	type=vxlan options:remote_ip=192.168.1.1 \
	options:key=1234 options:dst_port=4789 ofport_request=10
ip address add dev ovs-br0 172.16.1.2/24
ovs-vsctl set Bridge ovs-br0 ipfix=@i -- --id=@i create IPFIX \
	targets=\"172.16.1.1:1234\" sampling=1
iperf -c 172.16.1.1 -u -l 1000 -b 10M -t 1 -p 1234

This commit addresses the issue passing to vxlan_get_route() the
dst_cache already available into the lwt info processed by
vxlan_fill_metadata_dst().

Fixes: d71785ffc7e7 ("net: add dst_cache to ovs vxlan lwtunnel")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 50b62db213b0..30b04cf2bb1e 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2439,7 +2439,8 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 
 		rt = vxlan_get_route(vxlan, dev, sock4, skb, 0, info->key.tos,
 				     info->key.u.ipv4.dst,
-				     &info->key.u.ipv4.src, dport, sport, NULL, info);
+				     &info->key.u.ipv4.src, dport, sport,
+				     &info->dst_cache, info);
 		if (IS_ERR(rt))
 			return PTR_ERR(rt);
 		ip_rt_put(rt);
@@ -2450,7 +2451,8 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 
 		ndst = vxlan6_get_route(vxlan, dev, sock6, skb, 0, info->key.tos,
 					info->key.label, &info->key.u.ipv6.dst,
-					&info->key.u.ipv6.src, dport, sport, NULL, info);
+					&info->key.u.ipv6.src, dport, sport,
+					&info->dst_cache, info);
 		if (IS_ERR(ndst))
 			return PTR_ERR(ndst);
 		dst_release(ndst);

From 4c03b862b12f980456f9de92db6d508a4999b788 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 17 Feb 2017 16:19:39 -0500
Subject: [PATCH 5/5] irda: Fix lockdep annotations in hashbin_delete().

A nested lock depth was added to the hasbin_delete() code but it
doesn't actually work some well and results in tons of lockdep splats.

Fix the code instead to properly drop the lock around the operation
and just keep peeking the head of the hashbin queue.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Tested-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/irda/irqueue.c | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/net/irda/irqueue.c b/net/irda/irqueue.c
index acbe61c7e683..160dc89335e2 100644
--- a/net/irda/irqueue.c
+++ b/net/irda/irqueue.c
@@ -383,9 +383,6 @@ EXPORT_SYMBOL(hashbin_new);
  *    for deallocating this structure if it's complex. If not the user can
  *    just supply kfree, which should take care of the job.
  */
-#ifdef CONFIG_LOCKDEP
-static int hashbin_lock_depth = 0;
-#endif
 int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
 {
 	irda_queue_t* queue;
@@ -396,22 +393,27 @@ int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
 	IRDA_ASSERT(hashbin->magic == HB_MAGIC, return -1;);
 
 	/* Synchronize */
-	if ( hashbin->hb_type & HB_LOCK ) {
-		spin_lock_irqsave_nested(&hashbin->hb_spinlock, flags,
-					 hashbin_lock_depth++);
-	}
+	if (hashbin->hb_type & HB_LOCK)
+		spin_lock_irqsave(&hashbin->hb_spinlock, flags);
 
 	/*
 	 *  Free the entries in the hashbin, TODO: use hashbin_clear when
 	 *  it has been shown to work
 	 */
 	for (i = 0; i < HASHBIN_SIZE; i ++ ) {
-		queue = dequeue_first((irda_queue_t**) &hashbin->hb_queue[i]);
-		while (queue ) {
-			if (free_func)
-				(*free_func)(queue);
-			queue = dequeue_first(
-				(irda_queue_t**) &hashbin->hb_queue[i]);
+		while (1) {
+			queue = dequeue_first((irda_queue_t**) &hashbin->hb_queue[i]);
+
+			if (!queue)
+				break;
+
+			if (free_func) {
+				if (hashbin->hb_type & HB_LOCK)
+					spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
+				free_func(queue);
+				if (hashbin->hb_type & HB_LOCK)
+					spin_lock_irqsave(&hashbin->hb_spinlock, flags);
+			}
 		}
 	}
 
@@ -420,12 +422,8 @@ int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
 	hashbin->magic = ~HB_MAGIC;
 
 	/* Release lock */
-	if ( hashbin->hb_type & HB_LOCK) {
+	if (hashbin->hb_type & HB_LOCK)
 		spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
-#ifdef CONFIG_LOCKDEP
-		hashbin_lock_depth--;
-#endif
-	}
 
 	/*
 	 *  Free the hashbin structure