From 1c09f7d073b1d1ce85765c5552e4b40a6b6ba770 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 11 Nov 2018 06:41:29 -0800
Subject: [PATCH 1/3] tcp: do not try to defer skbs with eor mark (MSG_EOR)

Applications using MSG_EOR are giving a strong hint to TCP stack :

Subsequent sendmsg() can not append more bytes to skbs having
the EOR mark.

Do not try to TSO defer suchs skbs, there is really no hope.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9c34b97d365d..35feadf48030 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1944,6 +1944,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
 		goto send_now;
 
+	/* If this packet won't get more data, do not wait. */
+	if (TCP_SKB_CB(skb)->eor)
+		goto send_now;
+
 	win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
 	if (win_divisor) {
 		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);

From f1c6ea3827b5e5ec62e297bcf4ccfd065326e8f7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 11 Nov 2018 06:41:30 -0800
Subject: [PATCH 2/3] tcp: refine tcp_tso_should_defer() after EDT adoption

tcp_tso_should_defer() last step tries to check if the probable
next ACK packet is coming in less than half rtt.

Problem is that the head->tstamp might be in the future,
so we need to use signed arithmetics to avoid overflows.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 35feadf48030..78a56cef7e39 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1907,10 +1907,11 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 				 bool *is_cwnd_limited, u32 max_segs)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	u32 age, send_win, cong_win, limit, in_flight;
+	u32 send_win, cong_win, limit, in_flight;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *head;
 	int win_divisor;
+	s64 delta;
 
 	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 		goto send_now;
@@ -1972,9 +1973,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	head = tcp_rtx_queue_head(sk);
 	if (!head)
 		goto send_now;
-	age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head));
+	delta = tp->tcp_clock_cache - head->tstamp;
 	/* If next ACK is likely to come too late (half srtt), do not defer */
-	if (age < (tp->srtt_us >> 4))
+	if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
 		goto send_now;
 
 	/* Ok, it looks like it is advisable to defer. */

From a682850a114aef947da5d603f7fd2cfe7eabbd72 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 11 Nov 2018 06:41:31 -0800
Subject: [PATCH 3/3] tcp: get rid of tcp_tso_should_defer() dependency on
 HZ/jiffies

tcp_tso_should_defer() first heuristic is to not defer
if last send is "old enough".

Its current implementation uses jiffies and its low granularity.

TSO autodefer performance should not rely on kernel HZ :/

After EDT conversion, we have state variables in nanoseconds that
can allow us to properly implement the heuristic.

This patch increases TSO chunk sizes on medium rate flows,
especially when receivers do not use GRO or similar aggregation.

It also reduces bursts for HZ=100 or HZ=250 kernels, making TCP
behavior more uniform.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 78a56cef7e39..75dcf4daca72 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1920,9 +1920,12 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 		goto send_now;
 
 	/* Avoid bursty behavior by allowing defer
-	 * only if the last write was recent.
+	 * only if the last write was recent (1 ms).
+	 * Note that tp->tcp_wstamp_ns can be in the future if we have
+	 * packets waiting in a qdisc or device for EDT delivery.
 	 */
-	if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
+	delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
+	if (delta > 0)
 		goto send_now;
 
 	in_flight = tcp_packets_in_flight(tp);