From a7a2563064e963bc5e3f39f533974f2730c0ff56 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Oct 2018 09:37:54 -0700
Subject: [PATCH] tcp: mitigate scheduling jitter in EDT pacing model

In commit fefa569a9d4b ("net_sched: sch_fq: account for schedule/timers
drifts") we added a mitigation for scheduling jitter in fq packet scheduler.

This patch does the same in TCP stack, now it is using EDT model.

Note that this mitigation is valid for both external (fq packet scheduler)
or internal TCP pacing.

This uses the same strategy than the above commit, allowing
a time credit of half the packet currently sent.

Consider following case :

An skb is sent, after an idle period of 300 usec.
The air-time (skb->len/pacing_rate) is 500 usec
Instead of setting the pacing timer to now+500 usec,
it will use now+min(500/2, 300) -> now+250usec

This is like having a token bucket with a depth of half
an skb.

Tested:

tc qdisc replace dev eth0 root pfifo_fast

Before
netperf -P0 -H remote -- -q 1000000000 # 8000Mbit
540000 262144 262144    10.00    7710.43

After :
netperf -P0 -H remote -- -q 1000000000 # 8000 Mbit
540000 262144 262144    10.00    7999.75   # Much closer to 8000Mbit target

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f4aa4109334a..5474c9854f25 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -985,7 +985,8 @@ static void tcp_internal_pacing(struct sock *sk)
 	sock_hold(sk);
 }
 
-static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb)
+static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
+				      u64 prior_wstamp)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -998,7 +999,12 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb)
 		 * this is a minor annoyance.
 		 */
 		if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
-			tp->tcp_wstamp_ns += div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
+			u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
+			u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
+
+			/* take into account OS jitter */
+			len_ns -= min_t(u64, len_ns / 2, credit);
+			tp->tcp_wstamp_ns += len_ns;
 
 			tcp_internal_pacing(sk);
 		}
@@ -1029,6 +1035,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	struct sk_buff *oskb = NULL;
 	struct tcp_md5sig_key *md5;
 	struct tcphdr *th;
+	u64 prior_wstamp;
 	int err;
 
 	BUG_ON(!skb || !tcp_skb_pcount(skb));
@@ -1050,7 +1057,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 			return -ENOBUFS;
 	}
 
-	/* TODO: might take care of jitter here */
+	prior_wstamp = tp->tcp_wstamp_ns;
 	tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
 
 	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
@@ -1169,7 +1176,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		err = net_xmit_eval(err);
 	}
 	if (!err && oskb) {
-		tcp_update_skb_after_send(sk, oskb);
+		tcp_update_skb_after_send(sk, oskb, prior_wstamp);
 		tcp_rate_skb_sent(sk, oskb);
 	}
 	return err;
@@ -2321,7 +2328,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 
 		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
 			/* "skb_mstamp" is used as a start point for the retransmit timer */
-			tcp_update_skb_after_send(sk, skb);
+			tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
 			goto repair; /* Skip network transmission */
 		}
 
@@ -2896,7 +2903,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		} tcp_skb_tsorted_restore(skb);
 
 		if (!err) {
-			tcp_update_skb_after_send(sk, skb);
+			tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
 			tcp_rate_skb_sent(sk, skb);
 		}
 	} else {
-- 
2.30.2