udp: implement GRO for plain UDP sockets.
author Paolo Abeni <pabeni@redhat.com>
Wed, 7 Nov 2018 11:38:29 +0000 (12:38 +0100)
committer David S. Miller <davem@davemloft.net>
Thu, 8 Nov 2018 00:23:04 +0000 (16:23 -0800)
This is the RX counterpart of commit bec1f6f69736 ("udp: generate gso
with UDP_SEGMENT"). When UDP_GRO is enabled on a socket, that socket is
also eligible for GRO in the rx path: UDP segments directed to it are
assembled into a larger GSO_UDP_L4 packet.

The core UDP GRO support is enabled with setsockopt(UDP_GRO).

Initial benchmark numbers:

Before:
udp rx:   1079 MB/s   769065 calls/s

After:
udp rx:   1466 MB/s    24877 calls/s

This change introduces a side effect with respect to UDP tunnels:
once a UDP tunnel has been created, the kernel now performs a socket
lookup for every ingress UDP packet, whereas before the lookup happened
only if the ingress packet carried a valid internal header csum.

rfc v2 -> rfc v3:
 - fixed typos in macro name and comments
 - really enforce UDP_GRO_CNT_MAX, instead of UDP_GRO_CNT_MAX + 1
 - acquire socket lock in UDP_GRO setsockopt

rfc v1 -> rfc v2:
 - use a new option to enable UDP GRO
 - use static keys to protect the UDP GRO socket lookup

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/linux/udp.h
include/uapi/linux/udp.h
net/ipv4/udp.c
net/ipv4/udp_offload.c
net/ipv6/udp_offload.c

index a4dafff407fb8bd2fec7cd619cdf5dcf8245b997..f613b329852ef318687fe99786ff0b9b4a17aa38 100644 (file)
@@ -50,11 +50,12 @@ struct udp_sock {
        __u8             encap_type;    /* Is this an Encapsulation socket? */
        unsigned char    no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
                         no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
-                        encap_enabled:1; /* This socket enabled encap
+                        encap_enabled:1, /* This socket enabled encap
                                           * processing; UDP tunnels and
                                           * different encapsulation layer set
                                           * this
                                           */
+                        gro_enabled:1; /* Can accept GRO packets */
        /*
         * Following member retains the information to create a UDP header
         * when the socket is uncorked.
index 09502de447f57203db8952d9c197342317da59e9..30baccb6c9c4e25980ffe01dcf425be4fe6a8949 100644 (file)
@@ -33,6 +33,7 @@ struct udphdr {
 #define UDP_NO_CHECK6_TX 101   /* Disable sending checksum for UDP6X */
 #define UDP_NO_CHECK6_RX 102   /* Disable accpeting checksum for UDP6 */
 #define UDP_SEGMENT    103     /* Set GSO segmentation size */
+#define UDP_GRO                104     /* This socket can receive UDP GRO packets */
 
 /* UDP encapsulation types */
 #define UDP_ENCAP_ESPINUDP_NON_IKE     1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
index f81409921e272fec6f582a31c548fa27eeb417b3..9fc08b098ced584548a76b0c8fb3406849c3ac86 100644 (file)
@@ -2473,6 +2473,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
                up->gso_size = val;
                break;
 
+       case UDP_GRO:
+               lock_sock(sk);
+               if (valbool)
+                       udp_tunnel_encap_enable(sk->sk_socket);
+               up->gro_enabled = valbool;
+               release_sock(sk);
+               break;
+
        /*
         *      UDP-Lite's partial checksum coverage (RFC 3828).
         */
index 802f2bc00d69751a40856a6f4f82b32bd244617d..0646d61f4fa832d289ee997f839cfc2f9a29a23f 100644 (file)
@@ -343,6 +343,54 @@ out:
        return segs;
 }
 
+#define UDP_GRO_CNT_MAX 64 /* a flow is terminated once its GRO count reaches this */
+static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
+                                              struct sk_buff *skb)
+{
+       struct udphdr *uh = udp_hdr(skb);
+       struct sk_buff *pp = NULL;
+       struct udphdr *uh2;
+       struct sk_buff *p;
+
+       /* Require a non-zero csum, for symmetry with GSO; flush otherwise */
+       if (!uh->check) {
+               NAPI_GRO_CB(skb)->flush = 1;
+               return NULL;
+       }
+
+       /* Pull the UDP header and adjust the GRO checksum accordingly */
+       skb_gro_pull(skb, sizeof(struct udphdr));
+       skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
+
+       list_for_each_entry(p, head, list) {
+               if (!NAPI_GRO_CB(p)->same_flow)
+                       continue;
+
+               uh2 = udp_hdr(p);
+
+               /* Match ports only, as csum is always non-zero here */
+               if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
+                       NAPI_GRO_CB(p)->same_flow = 0;
+                       continue;
+               }
+
+               /* Terminate the flow on len mismatch or if it grows "too much".
+                * Under small packet flood GRO count could otherwise grow a lot,
+                * leading to excessive truesize values.
+                * NOTE(review): a failed merge flushes p only on len mismatch. */
+               if (!skb_gro_receive(p, skb) &&
+                   NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX)
+                       pp = p;
+               else if (uh->len != uh2->len)
+                       pp = p;
+
+               return pp;
+       }
+
+       /* No matching flow found, but we never need to flush */
+       return NULL;
+}
+
 struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
                                struct udphdr *uh, udp_lookup_t lookup)
 {
@@ -353,23 +401,27 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
        int flush = 1;
        struct sock *sk;
 
+       rcu_read_lock();
+       sk = (*lookup)(skb, uh->source, uh->dest);
+       if (!sk)
+               goto out_unlock;
+
+       if (udp_sk(sk)->gro_enabled) {
+               pp = call_gro_receive(udp_gro_receive_segment, head, skb);
+               rcu_read_unlock();
+               return pp;
+       }
+
        if (NAPI_GRO_CB(skb)->encap_mark ||
            (skb->ip_summed != CHECKSUM_PARTIAL &&
             NAPI_GRO_CB(skb)->csum_cnt == 0 &&
-            !NAPI_GRO_CB(skb)->csum_valid))
-               goto out;
+            !NAPI_GRO_CB(skb)->csum_valid) ||
+           !udp_sk(sk)->gro_receive)
+               goto out_unlock;
 
        /* mark that this skb passed once through the tunnel gro layer */
        NAPI_GRO_CB(skb)->encap_mark = 1;
 
-       rcu_read_lock();
-       sk = (*lookup)(skb, uh->source, uh->dest);
-
-       if (sk && udp_sk(sk)->gro_receive)
-               goto unflush;
-       goto out_unlock;
-
-unflush:
        flush = 0;
 
        list_for_each_entry(p, head, list) {
@@ -394,7 +446,6 @@ unflush:
 
 out_unlock:
        rcu_read_unlock();
-out:
        skb_gro_flush_final(skb, pp, flush);
        return pp;
 }
@@ -427,6 +478,19 @@ flush:
        return NULL;
 }
 
+static int udp_gro_complete_segment(struct sk_buff *skb)
+{
+       struct udphdr *uh = udp_hdr(skb);
+
+       skb->csum_start = (unsigned char *)uh - skb->head; /* UDP header offset from skb->head */
+       skb->csum_offset = offsetof(struct udphdr, check); /* checksum field within that header */
+       skb->ip_summed = CHECKSUM_PARTIAL;
+
+       skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; /* GRO count recorded for this skb */
+       skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4; /* flag the skb as a UDP L4 GSO packet */
+       return 0;
+}
+
 int udp_gro_complete(struct sk_buff *skb, int nhoff,
                     udp_lookup_t lookup)
 {
@@ -437,16 +501,21 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
 
        uh->len = newlen;
 
-       /* Set encapsulation before calling into inner gro_complete() functions
-        * to make them set up the inner offsets.
-        */
-       skb->encapsulation = 1;
-
        rcu_read_lock();
        sk = (*lookup)(skb, uh->source, uh->dest);
-       if (sk && udp_sk(sk)->gro_complete)
+       if (sk && udp_sk(sk)->gro_enabled) {
+               err = udp_gro_complete_segment(skb);
+       } else if (sk && udp_sk(sk)->gro_complete) {
+               skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
+                                       : SKB_GSO_UDP_TUNNEL;
+
+               /* Set encapsulation before calling into inner gro_complete()
+                * functions to make them set up the inner offsets.
+                */
+               skb->encapsulation = 1;
                err = udp_sk(sk)->gro_complete(sk, skb,
                                nhoff + sizeof(struct udphdr));
+       }
        rcu_read_unlock();
 
        if (skb->remcsum_offload)
@@ -461,13 +530,9 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
        const struct iphdr *iph = ip_hdr(skb);
        struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
 
-       if (uh->check) {
-               skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+       if (uh->check)
                uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
                                          iph->daddr, 0);
-       } else {
-               skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
-       }
 
        return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
 }
index 1b8e161ac527dd1c2f594680ef532cd7cf453378..828b2457f97b289edae92bfa45e99d0976c7127c 100644 (file)
@@ -147,13 +147,9 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
        const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
        struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
 
-       if (uh->check) {
-               skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+       if (uh->check)
                uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr,
                                          &ipv6h->daddr, 0);
-       } else {
-               skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
-       }
 
        return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
 }