net: netem: use a list in addition to rbtree
author	Peter Oskolkov <posk@google.com>
	Tue, 4 Dec 2018 19:55:56 +0000 (11:55 -0800)
committer	David S. Miller <davem@davemloft.net>
	Thu, 6 Dec 2018 04:18:41 +0000 (20:18 -0800)
When testing high-bandwidth TCP streams with large windows,
high latency, and low jitter, netem consumes a lot of CPU cycles
doing rbtree rebalancing.

This patch uses a linear list/queue in addition to the rbtree:
if an incoming packet's time_to_send is at or after that of the tail
of the linear queue, the packet is appended there; otherwise it is
inserted into the rbtree.
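
For illustration, a minimal userspace sketch of this enqueue rule
(an approximation, not the kernel code: struct pkt, tree_insert(),
and the file-scope t_head/t_tail stand in for sk_buff, the
rb_link_node()/rb_insert_color() pair, and the netem_sched_data
fields):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct pkt {
        uint64_t time_to_send;
        struct pkt *next;
};

/* linear queue, ordered by time_to_send */
static struct pkt *t_head, *t_tail;

/* placeholder for the kernel's rb_link_node()/rb_insert_color() path */
static void tree_insert(struct pkt *p)
{
        printf("tree insert: %llu\n", (unsigned long long)p->time_to_send);
}

static void tfifo_enqueue(struct pkt *p)
{
        if (!t_tail || p->time_to_send >= t_tail->time_to_send) {
                /* in send order: O(1) append, no rebalancing */
                if (t_tail)
                        t_tail->next = p;
                else
                        t_head = p;
                t_tail = p;
        } else {
                /* reordered (jitter): fall back to the tree */
                tree_insert(p);
        }
}

int main(void)
{
        uint64_t t[] = { 10, 20, 15, 30 };      /* 15 arrives out of order */

        for (size_t i = 0; i < sizeof(t) / sizeof(t[0]); i++) {
                struct pkt *p = calloc(1, sizeof(*p));

                if (!p)
                        return 1;
                p->time_to_send = t[i];
                tfifo_enqueue(p);
        }
        return 0;
}

With zero jitter, every packet takes the O(1) append path and the tree
stays empty; only packets that jitter reorders pay the rbtree
insertion cost.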

Without this patch, perf shows netem_enqueue, netem_dequeue,
and rb_* functions among the top offenders. With this patch,
only netem_enqueue is noticeable if jitter is low/absent.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Peter Oskolkov <posk@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
net/sched/sch_netem.c

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 2c38e3d0792468162ee0dc4137f1400160ab9276..84658f60a872444dbb256fe98bc5536e4f342376 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -77,6 +77,10 @@ struct netem_sched_data {
        /* internal t(ime)fifo qdisc uses t_root and sch->limit */
        struct rb_root t_root;
 
+       /* a linear queue; reduces rbtree rebalancing when jitter is low */
+       struct sk_buff  *t_head;
+       struct sk_buff  *t_tail;
+
        /* optional qdisc for classful handling (NULL at netem init) */
        struct Qdisc    *qdisc;
 
@@ -369,26 +373,39 @@ static void tfifo_reset(struct Qdisc *sch)
                rb_erase(&skb->rbnode, &q->t_root);
                rtnl_kfree_skbs(skb, skb);
        }
+
+       rtnl_kfree_skbs(q->t_head, q->t_tail);
+       q->t_head = NULL;
+       q->t_tail = NULL;
 }
 
 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 {
        struct netem_sched_data *q = qdisc_priv(sch);
        u64 tnext = netem_skb_cb(nskb)->time_to_send;
-       struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
 
-       while (*p) {
-               struct sk_buff *skb;
-
-               parent = *p;
-               skb = rb_to_skb(parent);
-               if (tnext >= netem_skb_cb(skb)->time_to_send)
-                       p = &parent->rb_right;
+       if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
+               if (q->t_tail)
+                       q->t_tail->next = nskb;
                else
-                       p = &parent->rb_left;
+                       q->t_head = nskb;
+               q->t_tail = nskb;
+       } else {
+               struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
+
+               while (*p) {
+                       struct sk_buff *skb;
+
+                       parent = *p;
+                       skb = rb_to_skb(parent);
+                       if (tnext >= netem_skb_cb(skb)->time_to_send)
+                               p = &parent->rb_right;
+                       else
+                               p = &parent->rb_left;
+               }
+               rb_link_node(&nskb->rbnode, parent, p);
+               rb_insert_color(&nskb->rbnode, &q->t_root);
        }
-       rb_link_node(&nskb->rbnode, parent, p);
-       rb_insert_color(&nskb->rbnode, &q->t_root);
        sch->q.qlen++;
 }
 
@@ -530,9 +547,16 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                                t_skb = skb_rb_last(&q->t_root);
                                t_last = netem_skb_cb(t_skb);
                                if (!last ||
-                                   t_last->time_to_send > last->time_to_send) {
+                                   t_last->time_to_send > last->time_to_send)
+                                       last = t_last;
+                       }
+                       if (q->t_tail) {
+                               struct netem_skb_cb *t_last =
+                                       netem_skb_cb(q->t_tail);
+
+                               if (!last ||
+                                   t_last->time_to_send > last->time_to_send)
                                        last = t_last;
-                               }
                        }
 
                        if (last) {
@@ -611,11 +635,38 @@ static void get_slot_next(struct netem_sched_data *q, u64 now)
        q->slot.bytes_left = q->slot_config.max_bytes;
 }
 
+static struct sk_buff *netem_peek(struct netem_sched_data *q)
+{
+       struct sk_buff *skb = skb_rb_first(&q->t_root);
+       u64 t1, t2;
+
+       if (!skb)
+               return q->t_head;
+       if (!q->t_head)
+               return skb;
+
+       t1 = netem_skb_cb(skb)->time_to_send;
+       t2 = netem_skb_cb(q->t_head)->time_to_send;
+       if (t1 < t2)
+               return skb;
+       return q->t_head;
+}
+
+static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
+{
+       if (skb == q->t_head) {
+               q->t_head = skb->next;
+               if (!q->t_head)
+                       q->t_tail = NULL;
+       } else {
+               rb_erase(&skb->rbnode, &q->t_root);
+       }
+}
+
 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 {
        struct netem_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;
-       struct rb_node *p;
 
 tfifo_dequeue:
        skb = __qdisc_dequeue_head(&sch->q);
@@ -625,20 +676,18 @@ deliver:
                qdisc_bstats_update(sch, skb);
                return skb;
        }
-       p = rb_first(&q->t_root);
-       if (p) {
+       skb = netem_peek(q);
+       if (skb) {
                u64 time_to_send;
                u64 now = ktime_get_ns();
 
-               skb = rb_to_skb(p);
-
                /* if more time remaining? */
                time_to_send = netem_skb_cb(skb)->time_to_send;
                if (q->slot.slot_next && q->slot.slot_next < time_to_send)
                        get_slot_next(q, now);
 
-               if (time_to_send <= now &&  q->slot.slot_next <= now) {
-                       rb_erase(p, &q->t_root);
+               if (time_to_send <= now && q->slot.slot_next <= now) {
+                       netem_erase_head(q, skb);
                        sch->q.qlen--;
                        qdisc_qstats_backlog_dec(sch, skb);
                        skb->next = NULL;