forked from Minki/linux
90c337da15
When an application needs to force a source IP on an active TCP socket it has to use bind(IP, port=x). As most applications do not want to deal with already used ports, x is often set to 0, meaning the kernel is in charge to find an available port. But kernel does not know yet if this socket is going to be a listener or be connected. It has very limited choices (no full knowledge of final 4-tuple for a connect()) With limited ephemeral port range (about 32K ports), it is very easy to fill the space. This patch adds a new SOL_IP socket option, asking kernel to ignore the 0 port provided by application in bind(IP, port=0) and only remember the given IP address. The port will be automatically chosen at connect() time, in a way that allows sharing a source port as long as the 4-tuples are unique. This new feature is available for both IPv4 and IPv6 (Thanks Neal) Tested: Wrote a test program and checked its behavior on IPv4 and IPv6. strace(1) shows sequences of bind(IP=127.0.0.2, port=0) followed by connect(). Also getsockname() show that the port is still 0 right after bind() but properly allocated after connect(). socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 5 setsockopt(5, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0 bind(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, 16) = 0 getsockname(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0 connect(5, {sa_family=AF_INET, sin_port=htons(53174), sin_addr=inet_addr("127.0.0.3")}, 16) = 0 getsockname(5, {sa_family=AF_INET, sin_port=htons(38050), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0 IPv6 test : socket(PF_INET6, SOCK_STREAM, IPPROTO_IP) = 7 setsockopt(7, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0 bind(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0 getsockname(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0 connect(7, {sa_family=AF_INET6, sin6_port=htons(57300), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0 getsockname(7, {sa_family=AF_INET6, sin6_port=htons(60964), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0 I was able to bind()/connect() a million concurrent IPv4 sockets, instead of ~32000 before patch. lpaa23:~# ulimit -n 1000010 lpaa23:~# ./bind --connect --num-flows=1000000 & 1000000 sockets lpaa23:~# grep TCP /proc/net/sockstat TCP: inuse 2000063 orphan 0 tw 47 alloc 2000157 mem 66 Check that a given source port is indeed used by many different connections : lpaa23:~# ss -t src :40000 | head -10 State Recv-Q Send-Q Local Address:Port Peer Address:Port ESTAB 0 0 127.0.0.2:40000 127.0.202.33:44983 ESTAB 0 0 127.0.0.2:40000 127.2.27.240:44983 ESTAB 0 0 127.0.0.2:40000 127.2.98.5:44983 ESTAB 0 0 127.0.0.2:40000 127.0.124.196:44983 ESTAB 0 0 127.0.0.2:40000 127.2.139.38:44983 ESTAB 0 0 127.0.0.2:40000 127.1.59.80:44983 ESTAB 0 0 127.0.0.2:40000 127.3.6.228:44983 ESTAB 0 0 127.0.0.2:40000 127.0.38.53:44983 ESTAB 0 0 127.0.0.2:40000 127.1.197.10:44983 Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> |
||
---|---|---|
.. | ||
netfilter | ||
af_inet.c | ||
ah4.c | ||
arp.c | ||
cipso_ipv4.c | ||
datagram.c | ||
devinet.c | ||
esp4.c | ||
fib_frontend.c | ||
fib_lookup.h | ||
fib_rules.c | ||
fib_semantics.c | ||
fib_trie.c | ||
fou.c | ||
geneve_core.c | ||
gre_demux.c | ||
gre_offload.c | ||
icmp.c | ||
igmp.c | ||
inet_connection_sock.c | ||
inet_diag.c | ||
inet_fragment.c | ||
inet_hashtables.c | ||
inet_lro.c | ||
inet_timewait_sock.c | ||
inetpeer.c | ||
ip_forward.c | ||
ip_fragment.c | ||
ip_gre.c | ||
ip_input.c | ||
ip_options.c | ||
ip_output.c | ||
ip_sockglue.c | ||
ip_tunnel_core.c | ||
ip_tunnel.c | ||
ip_vti.c | ||
ipcomp.c | ||
ipconfig.c | ||
ipip.c | ||
ipmr.c | ||
Kconfig | ||
Makefile | ||
netfilter.c | ||
ping.c | ||
proc.c | ||
protocol.c | ||
raw.c | ||
route.c | ||
syncookies.c | ||
sysctl_net_ipv4.c | ||
tcp_bic.c | ||
tcp_cong.c | ||
tcp_cubic.c | ||
tcp_dctcp.c | ||
tcp_diag.c | ||
tcp_fastopen.c | ||
tcp_highspeed.c | ||
tcp_htcp.c | ||
tcp_hybla.c | ||
tcp_illinois.c | ||
tcp_input.c | ||
tcp_ipv4.c | ||
tcp_lp.c | ||
tcp_memcontrol.c | ||
tcp_metrics.c | ||
tcp_minisocks.c | ||
tcp_offload.c | ||
tcp_output.c | ||
tcp_probe.c | ||
tcp_scalable.c | ||
tcp_timer.c | ||
tcp_vegas.c | ||
tcp_vegas.h | ||
tcp_veno.c | ||
tcp_westwood.c | ||
tcp_yeah.c | ||
tcp.c | ||
tunnel4.c | ||
udp_diag.c | ||
udp_impl.h | ||
udp_offload.c | ||
udp_tunnel.c | ||
udp.c | ||
udplite.c | ||
xfrm4_input.c | ||
xfrm4_mode_beet.c | ||
xfrm4_mode_transport.c | ||
xfrm4_mode_tunnel.c | ||
xfrm4_output.c | ||
xfrm4_policy.c | ||
xfrm4_protocol.c | ||
xfrm4_state.c | ||
xfrm4_tunnel.c |