Merge branch 'Rework-ip_ra_chain-protection'
Kirill Tkhai says:
====================
Rework ip_ra_chain protection
Commit 1215e51eda ("ipv4: fix a deadlock in ip_ra_control")
made raw_close() take rtnl_lock(). This function is called
on every RAW socket destruction, so rtnl_mutex is taken
every time. This scales very badly: I observe cleanup_net()
spending a lot of time in rtnl_lock(), and raw_close() is one
of the biggest rtnl users (since we have percpu net->ipv4.icmp_sk).
This patchset reworks the locking: it reverts the problematic
commit and its descendant, and introduces rtnl-independent locking.
This may have a continuation, and someone may work on killing
rtnl_lock() in mrtsock_destruct() in the future.
v3: Swap the order of patches [2/5] and [3/5].
v2: Fix a sparse warning in [4/5], as reported by the kbuild test robot.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
commit 1a2e10a240
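The locking scheme the series converges on pairs a per-netns mutex for updaters with RCU for readers: ip_ra_control() modifies net->ipv4.ra_chain only while holding net->ipv4.ra_mutex, and ip_call_ra_chain() walks the chain with no lock at all. Below is a minimal userspace sketch of that pattern, not kernel code: the names (struct pernet, ra_add, ra_walk) are illustrative, rcu_assign_pointer()/rcu_dereference() are modeled as C11 release/acquire atomics, and RCU's deferred reclamation is left out.

/* Build: cc -pthread ra_sketch.c
 * Updaters serialize on a per-"net" mutex; readers walk locklessly.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ra_entry {
	struct ra_entry *_Atomic next;
	int sk;				/* stands in for struct sock * */
};

struct pernet {				/* stands in for struct netns_ipv4 */
	struct ra_entry *_Atomic ra_chain;
	pthread_mutex_t ra_mutex;	/* serializes updaters only */
};

/* Reader side, like ip_call_ra_chain(): takes no lock. In the kernel this
 * runs under rcu_read_lock(), which has no stand-in here.
 */
static void ra_walk(struct pernet *net)
{
	struct ra_entry *ra;

	for (ra = atomic_load_explicit(&net->ra_chain, memory_order_acquire);
	     ra;
	     ra = atomic_load_explicit(&ra->next, memory_order_acquire))
		printf("deliver to sk %d\n", ra->sk);
}

/* Writer side, like the insert path of ip_ra_control(): the mutex excludes
 * other writers; the release store publishes the fully initialized node.
 */
static int ra_add(struct pernet *net, int sk)
{
	struct ra_entry *new_ra = malloc(sizeof(*new_ra));

	if (!new_ra)
		return -1;
	new_ra->sk = sk;

	pthread_mutex_lock(&net->ra_mutex);
	/* node not yet visible to readers: relaxed init of ->next is fine */
	atomic_store_explicit(&new_ra->next,
			      atomic_load_explicit(&net->ra_chain,
						   memory_order_relaxed),
			      memory_order_relaxed);
	/* like rcu_assign_pointer(): publish with release semantics */
	atomic_store_explicit(&net->ra_chain, new_ra, memory_order_release);
	pthread_mutex_unlock(&net->ra_mutex);
	return 0;
}

int main(void)
{
	struct pernet net = { .ra_mutex = PTHREAD_MUTEX_INITIALIZER };

	ra_add(&net, 1);
	ra_add(&net, 2);
	ra_walk(&net);		/* prints sk 2, then sk 1 */
	return 0;
}

Because the mutex lives in struct net, raw socket teardown in one namespace no longer contends with every other namespace the way the global rtnl_mutex did.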
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -91,6 +91,17 @@ static inline int inet_sdif(struct sk_buff *skb)
 	return 0;
 }
 
+/* Special input handler for packets caught by router alert option.
+   They are selected only by protocol field, and then processed likely
+   local ones; but only if someone wants them! Otherwise, router
+   not running rsvpd will kill RSVP.
+
+   It is user level problem, what it will make with them.
+   I have no idea, how it will masquearde or NAT them (it is joke, joke :-)),
+   but receiver should be enough clever f.e. to forward mtrace requests,
+   sent to multicast group to reach destination designated router.
+ */
+
 struct ip_ra_chain {
 	struct ip_ra_chain __rcu *next;
 	struct sock	*sk;
@@ -101,8 +112,6 @@ struct ip_ra_chain {
 	struct rcu_head	rcu;
 };
 
-extern struct ip_ra_chain __rcu *ip_ra_chain;
-
 /* IP flags. */
#define IP_CE	0x8000	/* Flag: "Congestion" */
#define IP_DF	0x4000	/* Flag: "Don't Fragment" */
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -49,6 +49,8 @@ struct netns_ipv4 {
 #endif
 	struct ipv4_devconf	*devconf_all;
 	struct ipv4_devconf	*devconf_dflt;
+	struct ip_ra_chain __rcu *ra_chain;
+	struct mutex	ra_mutex;
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	struct fib_rules_ops	*rules_ops;
 	bool			fib_has_custom_rules;
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -301,6 +301,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 	net->user_ns = user_ns;
 	idr_init(&net->netns_ids);
 	spin_lock_init(&net->nsid_lock);
+	mutex_init(&net->ipv4.ra_mutex);
 
 	list_for_each_entry(ops, &pernet_list, list) {
 		error = ops_init(ops, net);
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -159,7 +159,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
 	struct net_device *dev = skb->dev;
 	struct net *net = dev_net(dev);
 
-	for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
+	for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) {
 		struct sock *sk = ra->sk;
 
 		/* If socket is bound to an interface, only report
@@ -167,8 +167,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
 		 */
 		if (sk && inet_sk(sk)->inet_num == protocol &&
 		    (!sk->sk_bound_dev_if ||
-		     sk->sk_bound_dev_if == dev->ifindex) &&
-		    net_eq(sock_net(sk), net)) {
+		     sk->sk_bound_dev_if == dev->ifindex)) {
 			if (ip_is_fragment(ip_hdr(skb))) {
 				if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN))
 					return true;
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -322,20 +322,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
 	return 0;
 }
 
-
-/* Special input handler for packets caught by router alert option.
-   They are selected only by protocol field, and then processed likely
-   local ones; but only if someone wants them! Otherwise, router
-   not running rsvpd will kill RSVP.
-
-   It is user level problem, what it will make with them.
-   I have no idea, how it will masquearde or NAT them (it is joke, joke :-)),
-   but receiver should be enough clever f.e. to forward mtrace requests,
-   sent to multicast group to reach destination designated router.
- */
-struct ip_ra_chain __rcu *ip_ra_chain;
-
-
 static void ip_ra_destroy_rcu(struct rcu_head *head)
 {
 	struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);
@@ -349,23 +335,28 @@ int ip_ra_control(struct sock *sk, unsigned char on,
 {
 	struct ip_ra_chain *ra, *new_ra;
 	struct ip_ra_chain __rcu **rap;
+	struct net *net = sock_net(sk);
 
 	if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
 		return -EINVAL;
 
 	new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
 
-	for (rap = &ip_ra_chain;
-	     (ra = rtnl_dereference(*rap)) != NULL;
+	mutex_lock(&net->ipv4.ra_mutex);
+	for (rap = &net->ipv4.ra_chain;
+	     (ra = rcu_dereference_protected(*rap,
+			lockdep_is_held(&net->ipv4.ra_mutex))) != NULL;
 	     rap = &ra->next) {
 		if (ra->sk == sk) {
 			if (on) {
+				mutex_unlock(&net->ipv4.ra_mutex);
 				kfree(new_ra);
 				return -EADDRINUSE;
 			}
 			/* dont let ip_call_ra_chain() use sk again */
 			ra->sk = NULL;
 			RCU_INIT_POINTER(*rap, ra->next);
+			mutex_unlock(&net->ipv4.ra_mutex);
 
 			if (ra->destructor)
 				ra->destructor(sk);
@@ -379,14 +370,17 @@ int ip_ra_control(struct sock *sk, unsigned char on,
 			return 0;
 		}
 	}
-	if (!new_ra)
+	if (!new_ra) {
+		mutex_unlock(&net->ipv4.ra_mutex);
 		return -ENOBUFS;
+	}
 	new_ra->sk = sk;
 	new_ra->destructor = destructor;
 
 	RCU_INIT_POINTER(new_ra->next, ra);
 	rcu_assign_pointer(*rap, new_ra);
 	sock_hold(sk);
+	mutex_unlock(&net->ipv4.ra_mutex);
 
 	return 0;
 }
@@ -586,7 +580,6 @@ static bool setsockopt_needs_rtnl(int optname)
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 	case MCAST_UNBLOCK_SOURCE:
-	case IP_ROUTER_ALERT:
 		return true;
 	}
 	return false;
@@ -639,6 +632,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 
 	/* If optlen==0, it is equivalent to val == 0 */
 
+	if (optname == IP_ROUTER_ALERT)
+		return ip_ra_control(sk, val ? 1 : 0, NULL);
 	if (ip_mroute_opt(optname))
 		return ip_mroute_setsockopt(sk, optname, optval, optlen);
 
@@ -1149,9 +1144,6 @@ mc_msf_out:
 			goto e_inval;
 		inet->mc_all = val;
 		break;
-	case IP_ROUTER_ALERT:
-		err = ip_ra_control(sk, val ? 1 : 0, NULL);
-		break;
 
 	case IP_FREEBIND:
 		if (optlen < 1)
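Note the ordering in the removal path above: the entry is unlinked and ra->sk cleared while ra_mutex is held, the mutex is dropped, and only then does the destructor run. That keeps a destructor that takes heavier locks, such as mrtsock_destruct() taking rtnl below, from ever nesting inside ra_mutex. A hedged continuation of the earlier userspace sketch, reusing its types; the immediate free() stands in for the kernel's RCU-deferred ip_ra_destroy_rcu().

/* Removal, like the "off" path of ip_ra_control(): unlink under the
 * mutex, run the destructor only after dropping it.
 */
static int ra_del(struct pernet *net, int sk, void (*destructor)(int))
{
	struct ra_entry *_Atomic *rap;
	struct ra_entry *ra;

	pthread_mutex_lock(&net->ra_mutex);
	for (rap = &net->ra_chain;
	     (ra = atomic_load_explicit(rap, memory_order_relaxed)) != NULL;
	     rap = &ra->next) {
		if (ra->sk != sk)
			continue;
		/* unlink; readers already inside the node still see ->next */
		atomic_store_explicit(rap,
				      atomic_load_explicit(&ra->next,
							   memory_order_relaxed),
				      memory_order_release);
		pthread_mutex_unlock(&net->ra_mutex);

		if (destructor)		/* runs without ra_mutex held */
			destructor(sk);
		free(ra);	/* kernel: deferred via call_rcu() instead */
		return 0;
	}
	pthread_mutex_unlock(&net->ra_mutex);
	return -1;
}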
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1399,7 +1399,7 @@ static void mrtsock_destruct(struct sock *sk)
 	struct net *net = sock_net(sk);
 	struct mr_table *mrt;
 
-	ASSERT_RTNL();
+	rtnl_lock();
 	ipmr_for_each_table(mrt, net) {
 		if (sk == rtnl_dereference(mrt->mroute_sk)) {
 			IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
@@ -1411,6 +1411,7 @@ static void mrtsock_destruct(struct sock *sk)
 			mroute_clean_tables(mrt, false);
 		}
 	}
+	rtnl_unlock();
 }
 
 /* Socket options and virtual interface manipulation. The whole
@@ -1475,8 +1476,13 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
 		if (sk != rcu_access_pointer(mrt->mroute_sk)) {
 			ret = -EACCES;
 		} else {
+			/* We need to unlock here because mrtsock_destruct takes
+			 * care of rtnl itself and we can't change that due to
+			 * the IP_ROUTER_ALERT setsockopt which runs without it.
+			 */
+			rtnl_unlock();
 			ret = ip_ra_control(sk, 0, NULL);
-			goto out_unlock;
+			goto out;
 		}
 		break;
 	case MRT_ADD_VIF:
@@ -1588,6 +1594,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
 	}
 out_unlock:
 	rtnl_unlock();
+out:
 	return ret;
 }
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -711,9 +711,7 @@ static void raw_close(struct sock *sk, long timeout)
 	/*
 	 * Raw sockets may have direct kernel references. Kill them.
 	 */
-	rtnl_lock();
 	ip_ra_control(sk, 0, NULL);
-	rtnl_unlock();
 
 	sk_common_release(sk);
 }