selftests/bpf: test bpf flow dissection
author Petar Penkov <ppenkov@google.com>
Fri, 14 Sep 2018 14:46:22 +0000 (07:46 -0700)
committer Alexei Starovoitov <ast@kernel.org>
Fri, 14 Sep 2018 19:04:33 +0000 (12:04 -0700)
Adds a test that sends different types of packets over multiple
tunnels and verifies that valid packets are dissected correctly.  To do
so, a tc-flower rule is added to drop packets on UDP src port 9, and
packets are sent from ports 8, 9, and 10. Only the packets on port 9
should be dropped. Because tc-flower relies on the flow dissector to
match flows, correct classification demonstrates correct dissection.

Also add support logic to load the BPF program and to inject the test
packets.

Signed-off-by: Petar Penkov <ppenkov@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
tools/testing/selftests/bpf/.gitignore
tools/testing/selftests/bpf/Makefile
tools/testing/selftests/bpf/config
tools/testing/selftests/bpf/flow_dissector_load.c [new file with mode: 0644]
tools/testing/selftests/bpf/test_flow_dissector.c [new file with mode: 0644]
tools/testing/selftests/bpf/test_flow_dissector.sh [new file with mode: 0755]
tools/testing/selftests/bpf/with_addr.sh [new file with mode: 0755]
tools/testing/selftests/bpf/with_tunnels.sh [new file with mode: 0755]

index 4d789c1e5167a4e159485d6bc8b84472c1243b1b..8a60c9b9892d2dcb1f4eb9843bc2a7787ddfad3b 100644 (file)
@@ -23,3 +23,5 @@ test_skb_cgroup_id_user
 test_socket_cookie
 test_cgroup_storage
 test_select_reuseport
+test_flow_dissector
+flow_dissector_load
index e65f50f9185e9d641709f091c051819406e4e0fb..fd3851d5c07921caccbca2127afde286935fa1b5 100644 (file)
@@ -47,10 +47,12 @@ TEST_PROGS := test_kmod.sh \
        test_tunnel.sh \
        test_lwt_seg6local.sh \
        test_lirc_mode2.sh \
-       test_skb_cgroup_id.sh
+       test_skb_cgroup_id.sh \
+       test_flow_dissector.sh
 
 # Compile but not part of 'make run_tests'
-TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user
+TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user \
+       flow_dissector_load test_flow_dissector
 
 include ../lib.mk
 
index b4994a94968bfd9d12965fd630cba7e99458a30a..3655508f95fdcd2b5eda2de72e62804de39b0928 100644 (file)
@@ -18,3 +18,4 @@ CONFIG_CRYPTO_HMAC=m
 CONFIG_CRYPTO_SHA256=m
 CONFIG_VXLAN=y
 CONFIG_GENEVE=y
+CONFIG_NET_CLS_FLOWER=m
diff --git a/tools/testing/selftests/bpf/flow_dissector_load.c b/tools/testing/selftests/bpf/flow_dissector_load.c
new file mode 100644 (file)
index 0000000..d3273b5
--- /dev/null
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <error.h>
+#include <errno.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+const char *cfg_pin_path = "/sys/fs/bpf/flow_dissector";
+const char *cfg_map_name = "jmp_table";
+bool cfg_attach = true;
+char *cfg_section_name;
+char *cfg_path_name;
+
+/*
+ * Load the BPF object at cfg_path_name, store every program other than the
+ * cfg_section_name entry point in the cfg_map_name program array, attach
+ * the entry point as the flow dissector and pin the object at cfg_pin_path.
+ *
+ * Exits through error() on any failure.
+ */
+static void load_and_attach_program(void)
+{
+       struct bpf_program *prog, *main_prog;
+       struct bpf_map *prog_array;
+       int i, fd, prog_fd, ret;
+       struct bpf_object *obj;
+       int prog_array_fd;
+
+       ret = bpf_prog_load(cfg_path_name, BPF_PROG_TYPE_FLOW_DISSECTOR, &obj,
+                           &prog_fd);
+       if (ret)
+               error(1, 0, "bpf_prog_load %s", cfg_path_name);
+
+       /* the fd returned above is replaced by the requested section's fd */
+       main_prog = bpf_object__find_program_by_title(obj, cfg_section_name);
+       if (!main_prog)
+               error(1, 0, "bpf_object__find_program_by_title %s",
+                     cfg_section_name);
+
+       prog_fd = bpf_program__fd(main_prog);
+       if (prog_fd < 0)
+               error(1, 0, "bpf_program__fd");
+
+       prog_array = bpf_object__find_map_by_name(obj, cfg_map_name);
+       if (!prog_array)
+               error(1, 0, "bpf_object__find_map_by_name %s", cfg_map_name);
+
+       prog_array_fd = bpf_map__fd(prog_array);
+       if (prog_array_fd < 0)
+               error(1, 0, "bpf_map__fd %s", cfg_map_name);
+
+       /* slots are numbered in iteration order, skipping the entry point */
+       i = 0;
+       bpf_object__for_each_program(prog, obj) {
+               fd = bpf_program__fd(prog);
+               if (fd < 0)
+                       error(1, 0, "bpf_program__fd");
+
+               if (fd != prog_fd) {
+                       printf("%d: %s\n", i, bpf_program__title(prog, false));
+                       ret = bpf_map_update_elem(prog_array_fd, &i, &fd,
+                                                 BPF_ANY);
+                       /* a missing slot would only show up as bad dissection */
+                       if (ret)
+                               error(1, errno, "bpf_map_update_elem %s[%d]",
+                                     cfg_map_name, i);
+                       ++i;
+               }
+       }
+
+       ret = bpf_prog_attach(prog_fd, 0 /* Ignore */, BPF_FLOW_DISSECTOR, 0);
+       if (ret)
+               error(1, 0, "bpf_prog_attach %s", cfg_path_name);
+
+       ret = bpf_object__pin(obj, cfg_pin_path);
+       if (ret)
+               error(1, 0, "bpf_object__pin %s", cfg_pin_path);
+
+}
+
+/*
+ * Detach the flow dissector program and remove the pin directory at
+ * cfg_pin_path. Exits through error() on any failure.
+ */
+static void detach_program(void)
+{
+       char command[64];
+       int ret;
+
+       ret = bpf_prog_detach(0, BPF_FLOW_DISSECTOR);
+       if (ret)
+               error(1, 0, "bpf_prog_detach");
+
+       /* To unpin, it is necessary and sufficient to just remove this dir */
+       ret = snprintf(command, sizeof(command), "rm -r %s", cfg_pin_path);
+       if (ret < 0 || ret >= (int) sizeof(command))
+               error(1, 0, "pin path too long: %s", cfg_pin_path);
+
+       ret = system(command);
+       if (ret)
+               /* never hand a built string to error() as the format */
+               error(1, errno, "%s", command);
+}
+
+/*
+ * Parse the command line into the cfg_* globals.
+ *
+ *   -a         attach (the default when neither -a nor -d is given)
+ *   -d         detach; mutually exclusive with -a
+ *   -p <path>  BPF object file, required when attaching
+ *   -s <name>  section of the entry point, required when attaching
+ *
+ * Exits through error() on conflicting, repeated, missing or unknown
+ * options.
+ */
+static void parse_opts(int argc, char **argv)
+{
+       bool attach = false;
+       bool detach = false;
+       int c;
+
+       while ((c = getopt(argc, argv, "adp:s:")) != -1) {
+               switch (c) {
+               case 'a':
+                       if (detach)
+                               error(1, 0, "attach/detach are exclusive");
+                       attach = true;
+                       break;
+               case 'd':
+                       if (attach)
+                               error(1, 0, "attach/detach are exclusive");
+                       detach = true;
+                       break;
+               case 'p':
+                       if (cfg_path_name)
+                               error(1, 0, "only one prog name can be given");
+
+                       cfg_path_name = optarg;
+                       break;
+               case 's':
+                       if (cfg_section_name)
+                               error(1, 0, "only one section can be given");
+
+                       cfg_section_name = optarg;
+                       break;
+               default:
+                       /* getopt() has already printed what was wrong */
+                       error(1, 0, "unknown or malformed option");
+               }
+       }
+
+       if (detach)
+               cfg_attach = false;
+
+       if (cfg_attach && !cfg_path_name)
+               error(1, 0, "must provide a path to the BPF program");
+
+       if (cfg_attach && !cfg_section_name)
+               error(1, 0, "must provide a section name");
+}
+
+/* Parse the options, then either detach or load-and-attach the program. */
+int main(int argc, char **argv)
+{
+       parse_opts(argc, argv);
+
+       if (!cfg_attach) {
+               detach_program();
+               return 0;
+       }
+
+       load_and_attach_program();
+       return 0;
+}
diff --git a/tools/testing/selftests/bpf/test_flow_dissector.c b/tools/testing/selftests/bpf/test_flow_dissector.c
new file mode 100644 (file)
index 0000000..12b784a
--- /dev/null
@@ -0,0 +1,782 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Inject packets with all sorts of encapsulation into the kernel.
+ *
+ * IPv4/IPv6   outer layer 3
+ * GRE/GUE/BARE outer layer 4, where bare is IPIP/SIT/IPv4-in-IPv6/..
+ * IPv4/IPv6    inner layer 3
+ */
+
+#define _GNU_SOURCE
+
+#include <stddef.h>
+#include <arpa/inet.h>
+#include <asm/byteorder.h>
+#include <error.h>
+#include <errno.h>
+#include <linux/if_packet.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ipv6.h>
+#include <netinet/ip.h>
+#include <netinet/in.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#define CFG_PORT_INNER 8000
+
+/* Add some protocol definitions that do not exist in userspace */
+
+/*
+ * Four-byte GRE header as this test builds it: a 16-bit field that is never
+ * written here, followed by the 16-bit protocol of the encapsulated packet.
+ */
+struct grehdr {
+       uint16_t unused;
+       uint16_t protocol;
+} __attribute__((packed));
+
+/*
+ * GUE header as this test builds it: one 32-bit word that can also be
+ * addressed as hlen/control/version bitfields, proto_ctype and flags.
+ * The bitfield order follows the bitfield endianness macros from
+ * <asm/byteorder.h>. build_gue_header() only ever writes proto_ctype.
+ */
+struct guehdr {
+       union {
+               struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+                       __u8    hlen:5,
+                               control:1,
+                               version:2;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+                       __u8    version:2,
+                               control:1,
+                               hlen:5;
+#else
+#error  "Please fix <asm/byteorder.h>"
+#endif
+                       __u8    proto_ctype;
+                       __be16  flags;
+               };
+               __be32  word;
+       };
+};
+
+static uint8_t cfg_dsfield_inner;
+static uint8_t cfg_dsfield_outer;
+static uint8_t cfg_encap_proto;
+static bool    cfg_expect_failure = false;
+static int     cfg_l3_extra = AF_UNSPEC;       /* optional SIT prefix */
+static int     cfg_l3_inner = AF_UNSPEC;
+static int     cfg_l3_outer = AF_UNSPEC;
+static int     cfg_num_pkt = 10;
+static int     cfg_num_secs = 0;
+static char    cfg_payload_char = 'a';
+static int     cfg_payload_len = 100;
+static int     cfg_port_gue = 6080;
+static bool    cfg_only_rx;
+static bool    cfg_only_tx;
+static int     cfg_src_port = 9;
+
+static char    buf[ETH_DATA_LEN];
+
+#define INIT_ADDR4(name, addr4, port)                          \
+       static struct sockaddr_in name = {                      \
+               .sin_family = AF_INET,                          \
+               .sin_port = __constant_htons(port),             \
+               .sin_addr.s_addr = __constant_htonl(addr4),     \
+       };
+
+#define INIT_ADDR6(name, addr6, port)                          \
+       static struct sockaddr_in6 name = {                     \
+               .sin6_family = AF_INET6,                        \
+               .sin6_port = __constant_htons(port),            \
+               .sin6_addr = addr6,                             \
+       };
+
+INIT_ADDR4(in_daddr4, INADDR_LOOPBACK, CFG_PORT_INNER)
+INIT_ADDR4(in_saddr4, INADDR_LOOPBACK + 2, 0)
+INIT_ADDR4(out_daddr4, INADDR_LOOPBACK, 0)
+INIT_ADDR4(out_saddr4, INADDR_LOOPBACK + 1, 0)
+INIT_ADDR4(extra_daddr4, INADDR_LOOPBACK, 0)
+INIT_ADDR4(extra_saddr4, INADDR_LOOPBACK + 1, 0)
+
+INIT_ADDR6(in_daddr6, IN6ADDR_LOOPBACK_INIT, CFG_PORT_INNER)
+INIT_ADDR6(in_saddr6, IN6ADDR_LOOPBACK_INIT, 0)
+INIT_ADDR6(out_daddr6, IN6ADDR_LOOPBACK_INIT, 0)
+INIT_ADDR6(out_saddr6, IN6ADDR_LOOPBACK_INIT, 0)
+INIT_ADDR6(extra_daddr6, IN6ADDR_LOOPBACK_INIT, 0)
+INIT_ADDR6(extra_saddr6, IN6ADDR_LOOPBACK_INIT, 0)
+
+/* Return the current time of day in milliseconds. */
+static unsigned long util_gettime(void)
+{
+       struct timeval now;
+       time_t sec_ms;
+       suseconds_t usec_ms;
+
+       gettimeofday(&now, NULL);
+
+       sec_ms = now.tv_sec * 1000;
+       usec_ms = now.tv_usec / 1000;
+       return sec_ms + usec_ms;
+}
+
+/*
+ * Print "<msg>: <address>" to stderr for an IPv4 or IPv6 socket address.
+ * Exits through error() for any other address family.
+ */
+static void util_printaddr(const char *msg, struct sockaddr *addr)
+{
+       char text[INET6_ADDRSTRLEN];
+       unsigned long addr_off = 0;
+       const void *raw;
+
+       if (addr->sa_family == PF_INET)
+               addr_off = __builtin_offsetof(struct sockaddr_in, sin_addr);
+       else if (addr->sa_family == PF_INET6)
+               addr_off = __builtin_offsetof(struct sockaddr_in6, sin6_addr);
+       else
+               error(1, 0, "printaddr: unsupported family %u\n",
+                     addr->sa_family);
+
+       /* the binary address sits at a family-specific offset */
+       raw = ((void *) addr) + addr_off;
+       if (!inet_ntop(addr->sa_family, raw, text, sizeof(text)))
+               error(1, errno, "inet_ntop");
+
+       fprintf(stderr, "%s: %s\n", msg, text);
+}
+
+/* Add up num_u16 16-bit words starting at start; carries are not folded. */
+static unsigned long add_csum_hword(const uint16_t *start, int num_u16)
+{
+       const uint16_t *cur = start;
+       unsigned long total = 0;
+
+       while (num_u16-- > 0)
+               total += *cur++;
+
+       return total;
+}
+
+/*
+ * Add num_u16 16-bit words at start to the partial sum, fold the carries
+ * back into the low 16 bits and return the one's complement of the result.
+ */
+static uint16_t build_ip_csum(const uint16_t *start, int num_u16,
+                             unsigned long sum)
+{
+       unsigned long acc = sum + add_csum_hword(start, num_u16);
+
+       while (acc > 0xffff)
+               acc = (acc >> 16) + (acc & 0xffff);
+
+       return ~acc;
+}
+
+/*
+ * Fill in a 20-byte IPv4 header at header. src and dst are stored as
+ * passed in; payload_len is the number of bytes following the header.
+ * The checksum is summed over the header including its check field, so
+ * that field must still be zero on entry.
+ */
+static void build_ipv4_header(void *header, uint8_t proto,
+                             uint32_t src, uint32_t dst,
+                             int payload_len, uint8_t tos)
+{
+       struct iphdr *ip4h = header;
+
+       ip4h->version = 4;
+       ip4h->ihl = 5;
+       ip4h->tos = tos;
+       ip4h->tot_len = htons(sizeof(*ip4h) + payload_len);
+       ip4h->id = htons(1337);
+       ip4h->ttl = 8;
+       ip4h->protocol = proto;
+       ip4h->saddr = src;
+       ip4h->daddr = dst;
+
+       /* ihl is in 32-bit units, the checksum helper wants 16-bit units */
+       ip4h->check = build_ip_csum((void *) ip4h, ip4h->ihl * 2, 0);
+}
+
+/*
+ * Store dsfield in bits 4..11 of the first 16 bits of the IPv6 header,
+ * leaving the top and bottom nibbles of that halfword untouched.
+ */
+static void ipv6_set_dsfield(struct ipv6hdr *ip6h, uint8_t dsfield)
+{
+       uint16_t *first = (uint16_t *)ip6h;
+       uint16_t host = ntohs(*first);
+
+       host = (host & 0xF00F) | (((uint16_t) dsfield) << 4);
+       *first = htons(host);
+}
+
+/*
+ * Fill in an IPv6 header at header using the addresses of src and dst.
+ * payload_len is the number of bytes following the header.
+ */
+static void build_ipv6_header(void *header, uint8_t proto,
+                             struct sockaddr_in6 *src,
+                             struct sockaddr_in6 *dst,
+                             int payload_len, uint8_t dsfield)
+{
+       struct ipv6hdr *hdr6 = header;
+
+       hdr6->version = 6;
+       ipv6_set_dsfield(hdr6, dsfield);
+
+       hdr6->payload_len = htons(payload_len);
+       hdr6->nexthdr = proto;
+       hdr6->hop_limit = 8;
+
+       memcpy(&hdr6->daddr, &dst->sin6_addr, sizeof(hdr6->daddr));
+       memcpy(&hdr6->saddr, &src->sin6_addr, sizeof(hdr6->saddr));
+}
+
+/*
+ * Compute the UDP checksum for an IPv4 datagram: the pseudo-header sum
+ * (source and destination address, protocol, UDP length) plus num_words
+ * 16-bit words starting at udph.
+ */
+static uint16_t build_udp_v4_csum(const struct iphdr *iph,
+                                 const struct udphdr *udph,
+                                 int num_words)
+{
+       /* saddr and daddr together span sizeof(saddr) halfwords */
+       const int addr_hwords = sizeof(iph->saddr);
+       unsigned long pseudo;
+
+       pseudo = htons(IPPROTO_UDP);
+       pseudo += udph->len;
+       pseudo += add_csum_hword((void *) &iph->saddr, addr_hwords);
+
+       return build_ip_csum((void *) udph, num_words, pseudo);
+}
+
+/*
+ * Compute the UDP checksum for an IPv6 datagram: the pseudo-header sum
+ * (source and destination address, next header, payload length) plus
+ * num_words 16-bit words starting at udph.
+ */
+static uint16_t build_udp_v6_csum(const struct ipv6hdr *ip6h,
+                                 const struct udphdr *udph,
+                                 int num_words)
+{
+       /* saddr and daddr together span sizeof(saddr) halfwords */
+       const int addr_hwords = sizeof(ip6h->saddr);
+       unsigned long pseudo;
+
+       pseudo = htons(ip6h->nexthdr);
+       pseudo += ip6h->payload_len;
+       pseudo += add_csum_hword((void *) &ip6h->saddr, addr_hwords);
+
+       return build_ip_csum((void *) udph, num_words, pseudo);
+}
+
+/*
+ * Fill in a UDP header at header for payload_len bytes of payload, using
+ * cfg_src_port as source port and dport as destination port.
+ *
+ * The IP header of the given family must already be in place directly in
+ * front of header, as the checksum covers its pseudo-header fields. For an
+ * odd datagram length the byte following the payload is included in the
+ * checksum as padding and must be zero.
+ */
+static void build_udp_header(void *header, int payload_len,
+                            uint16_t dport, int family)
+{
+       struct udphdr *udph = header;
+       int len = sizeof(*udph) + payload_len;
+       /* round up, or the last byte of an odd-length datagram is skipped */
+       int num_words = (len + 1) >> 1;
+
+       udph->source = htons(cfg_src_port);
+       udph->dest = htons(dport);
+       udph->len = htons(len);
+       udph->check = 0;
+       if (family == AF_INET)
+               udph->check = build_udp_v4_csum(header - sizeof(struct iphdr),
+                                               udph, num_words);
+       else
+               udph->check = build_udp_v6_csum(header - sizeof(struct ipv6hdr),
+                                               udph, num_words);
+}
+
+/* Set the protocol of the GUE header at header; nothing else is written. */
+static void build_gue_header(void *header, uint8_t proto)
+{
+       ((struct guehdr *) header)->proto_ctype = proto;
+}
+
+/*
+ * Set the protocol of the GRE header at header, converting it to network
+ * byte order; nothing else is written.
+ */
+static void build_gre_header(void *header, uint16_t proto)
+{
+       ((struct grehdr *) header)->protocol = htons(proto);
+}
+
+/* Size of the IPv4 header for AF_INET, of the IPv6 header otherwise. */
+static int l3_length(int family)
+{
+       return family == AF_INET ? sizeof(struct iphdr)
+                                : sizeof(struct ipv6hdr);
+}
+
+/*
+ * Assemble the test packet in the global buf and return its length.
+ *
+ * Layout, front to back: optional extra L3 header, optional outer L3
+ * header with its GRE or UDP+GUE encapsulation header, inner L3 header,
+ * inner UDP header, then cfg_payload_len bytes of cfg_payload_char.
+ * Without encapsulation only the inner headers and payload are built.
+ *
+ * Exits through error() if the packet would not fit in buf.
+ */
+static int build_packet(void)
+{
+       int ol3_len = 0, ol4_len = 0, il3_len = 0, il4_len = 0;
+       int el3_len = 0;
+
+       if (cfg_l3_extra)
+               el3_len = l3_length(cfg_l3_extra);
+
+       /* calculate header offsets */
+       if (cfg_encap_proto) {
+               ol3_len = l3_length(cfg_l3_outer);
+
+               /* any other encap protocol has no outer L4 header */
+               if (cfg_encap_proto == IPPROTO_GRE)
+                       ol4_len = sizeof(struct grehdr);
+               else if (cfg_encap_proto == IPPROTO_UDP)
+                       ol4_len = sizeof(struct udphdr) + sizeof(struct guehdr);
+       }
+
+       il3_len = l3_length(cfg_l3_inner);
+       il4_len = sizeof(struct udphdr);
+
+       /* ">=" keeps one byte free for the checksum padding written below */
+       if (el3_len + ol3_len + ol4_len + il3_len + il4_len + cfg_payload_len >=
+           sizeof(buf))
+               error(1, 0, "packet too large\n");
+
+       /*
+        * Fill packet from inside out, to calculate correct checksums.
+        * But create ip before udp headers, as udp uses ip for pseudo-sum.
+        */
+       memset(buf + el3_len + ol3_len + ol4_len + il3_len + il4_len,
+              cfg_payload_char, cfg_payload_len);
+
+       /* add zero byte for udp csum padding */
+       buf[el3_len + ol3_len + ol4_len + il3_len + il4_len + cfg_payload_len] = 0;
+
+       /* inner L3 header */
+       switch (cfg_l3_inner) {
+       case PF_INET:
+               build_ipv4_header(buf + el3_len + ol3_len + ol4_len,
+                                 IPPROTO_UDP,
+                                 in_saddr4.sin_addr.s_addr,
+                                 in_daddr4.sin_addr.s_addr,
+                                 il4_len + cfg_payload_len,
+                                 cfg_dsfield_inner);
+               break;
+       case PF_INET6:
+               build_ipv6_header(buf + el3_len + ol3_len + ol4_len,
+                                 IPPROTO_UDP,
+                                 &in_saddr6, &in_daddr6,
+                                 il4_len + cfg_payload_len,
+                                 cfg_dsfield_inner);
+               break;
+       }
+
+       /* inner UDP header, checksummed against the inner L3 header above */
+       build_udp_header(buf + el3_len + ol3_len + ol4_len + il3_len,
+                        cfg_payload_len, CFG_PORT_INNER, cfg_l3_inner);
+
+       if (!cfg_encap_proto)
+               return il3_len + il4_len + cfg_payload_len;
+
+       /* outer L3 header, carrying the encapsulation protocol number */
+       switch (cfg_l3_outer) {
+       case PF_INET:
+               build_ipv4_header(buf + el3_len, cfg_encap_proto,
+                                 out_saddr4.sin_addr.s_addr,
+                                 out_daddr4.sin_addr.s_addr,
+                                 ol4_len + il3_len + il4_len + cfg_payload_len,
+                                 cfg_dsfield_outer);
+               break;
+       case PF_INET6:
+               build_ipv6_header(buf + el3_len, cfg_encap_proto,
+                                 &out_saddr6, &out_daddr6,
+                                 ol4_len + il3_len + il4_len + cfg_payload_len,
+                                 cfg_dsfield_outer);
+               break;
+       }
+
+       /* outer L4 header: GUE over UDP, or GRE; none for other protocols */
+       switch (cfg_encap_proto) {
+       case IPPROTO_UDP:
+               /* the GUE header occupies the tail of the outer L4 area */
+               build_gue_header(buf + el3_len + ol3_len + ol4_len -
+                                sizeof(struct guehdr),
+                                cfg_l3_inner == PF_INET ? IPPROTO_IPIP
+                                                        : IPPROTO_IPV6);
+               build_udp_header(buf + el3_len + ol3_len,
+                                sizeof(struct guehdr) + il3_len + il4_len +
+                                cfg_payload_len,
+                                cfg_port_gue, cfg_l3_outer);
+               break;
+       case IPPROTO_GRE:
+               build_gre_header(buf + el3_len + ol3_len,
+                                cfg_l3_inner == PF_INET ? ETH_P_IP
+                                                        : ETH_P_IPV6);
+               break;
+       }
+
+       /* optional extra L3 header in front, wrapping the outer L3 header */
+       switch (cfg_l3_extra) {
+       case PF_INET:
+               build_ipv4_header(buf,
+                                 cfg_l3_outer == PF_INET ? IPPROTO_IPIP
+                                                         : IPPROTO_IPV6,
+                                 extra_saddr4.sin_addr.s_addr,
+                                 extra_daddr4.sin_addr.s_addr,
+                                 ol3_len + ol4_len + il3_len + il4_len +
+                                 cfg_payload_len, 0);
+               break;
+       case PF_INET6:
+               build_ipv6_header(buf,
+                                 cfg_l3_outer == PF_INET ? IPPROTO_IPIP
+                                                         : IPPROTO_IPV6,
+                                 &extra_saddr6, &extra_daddr6,
+                                 ol3_len + ol4_len + il3_len + il4_len +
+                                 cfg_payload_len, 0);
+               break;
+       }
+
+       return el3_len + ol3_len + ol4_len + il3_len + il4_len +
+              cfg_payload_len;
+}
+
+/*
+ * Open the raw transmit socket and connect it to the destination address
+ * of the outermost header that will be built: the extra header if one is
+ * configured, else the outer header, else the inner header.
+ * Returns the socket; exits through error() on failure.
+ */
+static int setup_tx(void)
+{
+       struct sockaddr_in6 *dst6;
+       struct sockaddr_in *dst4;
+       int family, fd, ret;
+
+       /* pick the outermost configured layer and its destination */
+       if (cfg_l3_extra) {
+               family = cfg_l3_extra;
+               dst4 = &extra_daddr4;
+               dst6 = &extra_daddr6;
+       } else if (cfg_l3_outer) {
+               family = cfg_l3_outer;
+               dst4 = &out_daddr4;
+               dst6 = &out_daddr6;
+       } else {
+               family = cfg_l3_inner;
+               dst4 = &in_daddr4;
+               dst6 = &in_daddr6;
+       }
+
+       fd = socket(family, SOCK_RAW, IPPROTO_RAW);
+       if (fd == -1)
+               error(1, errno, "socket tx");
+
+       if (family == PF_INET)
+               ret = connect(fd, (void *) dst4, sizeof(*dst4));
+       else
+               ret = connect(fd, (void *) dst6, sizeof(*dst6));
+       if (ret)
+               error(1, errno, "connect tx");
+
+       return fd;
+}
+
+/*
+ * Open the receive socket: a datagram socket of the inner family, bound to
+ * the inner destination address. Returns the socket; exits through error()
+ * on failure.
+ */
+static int setup_rx(void)
+{
+       int sock, err;
+
+       sock = socket(cfg_l3_inner, SOCK_DGRAM, 0);
+       if (sock == -1)
+               error(1, errno, "socket rx");
+
+       err = (cfg_l3_inner == PF_INET)
+               ? bind(sock, (void *) &in_daddr4, sizeof(in_daddr4))
+               : bind(sock, (void *) &in_daddr6, sizeof(in_daddr6));
+       if (err)
+               error(1, errno, "bind rx");
+
+       return sock;
+}
+
+/*
+ * Write one packet of len bytes to fd. Returns 1, the number of packets
+ * sent; exits through error() on failure or on a short write.
+ */
+static int do_tx(int fd, const char *pkt, int len)
+{
+       int ret;
+
+       ret = write(fd, pkt, len);
+       if (ret == -1)
+               error(1, errno, "send");
+       /* a short write sets no errno, so do not print a stale one */
+       if (ret != len)
+               error(1, 0, "send: len (%d < %d)\n", ret, len);
+
+       return 1;
+}
+
+/*
+ * Wait up to timeout milliseconds for events on fd. Returns the poll()
+ * result: 0 on timeout, non-zero when POLLIN is set. Exits through error()
+ * on failure or when an event is reported without POLLIN.
+ */
+static int do_poll(int fd, short events, int timeout)
+{
+       struct pollfd pfd;
+       int ret;
+
+       pfd.fd = fd;
+       pfd.events = events;
+       pfd.revents = 0;
+
+       ret = poll(&pfd, 1, timeout);
+       if (ret == -1)
+               error(1, errno, "poll");
+       /* poll() succeeded here, so errno carries no information */
+       if (ret && !(pfd.revents & POLLIN))
+               error(1, 0, "poll: unexpected event 0x%x\n", pfd.revents);
+
+       return ret;
+}
+
+/*
+ * Drain fd without blocking and return the number of datagrams read.
+ * Only the first payload byte of each datagram is read and compared with
+ * cfg_payload_char. Exits through error() on a receive failure or a
+ * payload mismatch.
+ */
+static int do_rx(int fd)
+{
+       char rbuf;
+       int ret, num = 0;
+
+       while (1) {
+               ret = recv(fd, &rbuf, 1, MSG_DONTWAIT);
+               if (ret == -1 && errno == EAGAIN)
+                       break;
+               if (ret == -1)
+                       error(1, errno, "recv");
+
+               if (ret == 0) {
+                       /* empty datagram: rbuf was not written, do not read it */
+                       if (cfg_payload_len)
+                               error(1, 0, "recv: payload mismatch");
+               } else if (rbuf != cfg_payload_char) {
+                       error(1, 0, "recv: payload mismatch");
+               }
+               num++;
+       }
+
+       return num;
+}
+
+/*
+ * Run the test: open the sockets, build the packet once, then send and/or
+ * receive until cfg_num_pkt packets were sent or, when cfg_num_secs is
+ * set, until that many seconds have passed.
+ *
+ * Returns 0 on success and 1 on failure. Success means that every packet
+ * sent was received, or with cfg_expect_failure that none was. In tx-only
+ * mode nothing is received, so there is nothing to verify.
+ *
+ * Note that in rx-only mode without cfg_num_secs the packet count is never
+ * reached, because tx stays 0.
+ */
+static int do_main(void)
+{
+       unsigned long tstop, treport, tcur;
+       int fdt = -1, fdr = -1, len, tx = 0, rx = 0;
+
+       if (!cfg_only_tx)
+               fdr = setup_rx();
+       if (!cfg_only_rx)
+               fdt = setup_tx();
+
+       len = build_packet();
+
+       tcur = util_gettime();
+       treport = tcur + 1000;
+       tstop = tcur + (cfg_num_secs * 1000);
+
+       while (1) {
+               if (!cfg_only_rx)
+                       tx += do_tx(fdt, buf, len);
+
+               if (!cfg_only_tx)
+                       rx += do_rx(fdr);
+
+               if (cfg_num_secs) {
+                       tcur = util_gettime();
+                       if (tcur >= tstop)
+                               break;
+                       /* timed runs report and reset the counters every second */
+                       if (tcur >= treport) {
+                               fprintf(stderr, "pkts: tx=%u rx=%u\n", tx, rx);
+                               tx = 0;
+                               rx = 0;
+                               treport = tcur + 1000;
+                       }
+               } else {
+                       if (tx == cfg_num_pkt)
+                               break;
+               }
+       }
+
+       /*
+        * read straggler packets, if any. Tx-only mode has no rx socket,
+        * and receiving on fd -1 would fail with EBADF.
+        */
+       if (fdr != -1 && rx < tx) {
+               tstop = util_gettime() + 100;
+               while (rx < tx) {
+                       tcur = util_gettime();
+                       if (tcur >= tstop)
+                               break;
+
+                       do_poll(fdr, POLLIN, tstop - tcur);
+                       rx += do_rx(fdr);
+               }
+       }
+
+       fprintf(stderr, "pkts: tx=%u rx=%u\n", tx, rx);
+
+       if (fdr != -1 && close(fdr))
+               error(1, errno, "close rx");
+       if (fdt != -1 && close(fdt))
+               error(1, errno, "close tx");
+
+       /* nothing was meant to be received, so sending alone is success */
+       if (cfg_only_tx)
+               return 0;
+
+       /*
+        * success (== 0) only if received all packets
+        * unless failure is expected, in which case none must arrive.
+        */
+       if (cfg_expect_failure)
+               return rx != 0;
+       else
+               return rx != tx;
+}
+
+
+/* Print the command line synopsis to stderr and exit with status 1. */
+static void __attribute__((noreturn)) usage(const char *filepath)
+{
+       fprintf(stderr, "Usage: %s [-e gre|gue|bare|none] [-i 4|6] [-l len] "
+                       "[-O 4|6] [-o 4|6] [-n num] [-t secs] [-R] [-T] "
+                       "[-s <osrc>] [-d <odst>] [-S <isrc>] [-D <idst>] "
+                       "[-x <otos>] [-X <itos>] [-f <isport>] [-F]\n",
+               filepath);
+       exit(1);
+}
+
+/*
+ * Convert the textual address optarg of the given family into addr.
+ * Exits through error() if the conversion fails or the text is invalid.
+ */
+static void parse_addr(int family, void *addr, const char *optarg)
+{
+       switch (inet_pton(family, optarg, addr)) {
+       case -1:
+               error(1, errno, "inet_pton");
+               break;
+       case 0:
+               error(1, 0, "inet_pton: bad string");
+               break;
+       }
+}
+
+/* Parse the IPv4 address text optarg into the address field of addr. */
+static void parse_addr4(struct sockaddr_in *addr, const char *optarg)
+{
+       struct in_addr *dst = &addr->sin_addr;
+
+       parse_addr(AF_INET, dst, optarg);
+}
+
+/* Parse the IPv6 address text optarg into the address field of addr. */
+static void parse_addr6(struct sockaddr_in6 *addr, const char *optarg)
+{
+       struct in6_addr *dst = &addr->sin6_addr;
+
+       parse_addr(AF_INET6, dst, optarg);
+}
+
+/*
+ * Map "4" to PF_INET and "6" to PF_INET6. Any other text prints the usage
+ * message for filepath and exits.
+ */
+static int parse_protocol_family(const char *filepath, const char *optarg)
+{
+       int family = AF_UNSPEC;
+
+       if (strcmp(optarg, "4") == 0)
+               family = PF_INET;
+       else if (strcmp(optarg, "6") == 0)
+               family = PF_INET6;
+
+       if (family == AF_UNSPEC)
+               usage(filepath);
+
+       return family;
+}
+
+/*
+ * Parse the command line into the cfg_* globals and the address globals,
+ * then validate the combination of options.
+ *
+ * Address options are parsed using the family selected so far, so -d and
+ * -s must follow -o, and -D must follow -i. -S without a preceding -i is
+ * parsed as IPv6, which is also the default inner family.
+ *
+ * Exits through error() or usage() on invalid input.
+ */
+static void parse_opts(int argc, char **argv)
+{
+       int c;
+
+       while ((c = getopt(argc, argv, "d:D:e:f:Fhi:l:n:o:O:Rs:S:t:Tx:X:")) != -1) {
+               switch (c) {
+               case 'd':
+                       if (cfg_l3_outer == AF_UNSPEC)
+                               error(1, 0, "-d must be preceded by -o");
+                       if (cfg_l3_outer == AF_INET)
+                               parse_addr4(&out_daddr4, optarg);
+                       else
+                               parse_addr6(&out_daddr6, optarg);
+                       break;
+               case 'D':
+                       if (cfg_l3_inner == AF_UNSPEC)
+                               error(1, 0, "-D must be preceded by -i");
+                       if (cfg_l3_inner == AF_INET)
+                               parse_addr4(&in_daddr4, optarg);
+                       else
+                               parse_addr6(&in_daddr6, optarg);
+                       break;
+               case 'e':
+                       if (!strcmp(optarg, "gre"))
+                               cfg_encap_proto = IPPROTO_GRE;
+                       else if (!strcmp(optarg, "gue"))
+                               cfg_encap_proto = IPPROTO_UDP;
+                       else if (!strcmp(optarg, "bare"))
+                               cfg_encap_proto = IPPROTO_IPIP;
+                       else if (!strcmp(optarg, "none"))
+                               cfg_encap_proto = IPPROTO_IP;   /* == 0 */
+                       else
+                               usage(argv[0]);
+                       break;
+               case 'f':
+                       cfg_src_port = strtol(optarg, NULL, 0);
+                       break;
+               case 'F':
+                       cfg_expect_failure = true;
+                       break;
+               case 'h':
+                       usage(argv[0]);
+                       break;
+               case 'i':
+                       if (!strcmp(optarg, "4"))
+                               cfg_l3_inner = PF_INET;
+                       else if (!strcmp(optarg, "6"))
+                               cfg_l3_inner = PF_INET6;
+                       else
+                               usage(argv[0]);
+                       break;
+               case 'l':
+                       cfg_payload_len = strtol(optarg, NULL, 0);
+                       break;
+               case 'n':
+                       cfg_num_pkt = strtol(optarg, NULL, 0);
+                       break;
+               case 'o':
+                       cfg_l3_outer = parse_protocol_family(argv[0], optarg);
+                       break;
+               case 'O':
+                       cfg_l3_extra = parse_protocol_family(argv[0], optarg);
+                       break;
+               case 'R':
+                       cfg_only_rx = true;
+                       break;
+               case 's':
+                       /* same rule as -d: the outer family picks the parser */
+                       if (cfg_l3_outer == AF_UNSPEC)
+                               error(1, 0, "-s must be preceded by -o");
+                       if (cfg_l3_outer == AF_INET)
+                               parse_addr4(&out_saddr4, optarg);
+                       else
+                               parse_addr6(&out_saddr6, optarg);
+                       break;
+               case 'S':
+                       if (cfg_l3_inner == AF_INET)
+                               parse_addr4(&in_saddr4, optarg);
+                       else
+                               parse_addr6(&in_saddr6, optarg);
+                       break;
+               case 't':
+                       cfg_num_secs = strtol(optarg, NULL, 0);
+                       break;
+               case 'T':
+                       cfg_only_tx = true;
+                       break;
+               case 'x':
+                       cfg_dsfield_outer = strtol(optarg, NULL, 0);
+                       break;
+               case 'X':
+                       cfg_dsfield_inner = strtol(optarg, NULL, 0);
+                       break;
+               default:
+                       /* unknown option or missing argument */
+                       usage(argv[0]);
+               }
+       }
+
+       if (cfg_only_rx && cfg_only_tx)
+               error(1, 0, "options: cannot combine rx-only and tx-only");
+
+       if (cfg_encap_proto && cfg_l3_outer == AF_UNSPEC)
+               error(1, 0, "options: must specify outer with encap");
+       else if ((!cfg_encap_proto) && cfg_l3_outer != AF_UNSPEC)
+               error(1, 0, "options: cannot combine no-encap and outer");
+       else if ((!cfg_encap_proto) && cfg_l3_extra != AF_UNSPEC)
+               error(1, 0, "options: cannot combine no-encap and extra");
+
+       /* the inner family defaults to IPv6 */
+       if (cfg_l3_inner == AF_UNSPEC)
+               cfg_l3_inner = AF_INET6;
+       /* "bare" encapsulation of an IPv6 inner packet uses IPPROTO_IPV6 */
+       if (cfg_l3_inner == AF_INET6 && cfg_encap_proto == IPPROTO_IPIP)
+               cfg_encap_proto = IPPROTO_IPV6;
+
+       /* RFC 6040 4.2:
+        *   on decap, if outer encountered congestion (CE == 0x3),
+        *   but inner cannot encode ECN (NoECT == 0x0), then drop packet.
+        */
+       if (((cfg_dsfield_outer & 0x3) == 0x3) &&
+           ((cfg_dsfield_inner & 0x3) == 0x0))
+               cfg_expect_failure = true;
+}
+
+/*
+ * Print the configured addresses to stderr: always the inner pair, then
+ * the encapsulation protocol and outer pair if an outer header is
+ * configured, then the extra pair if an extra header is configured.
+ */
+static void print_opts(void)
+{
+       if (cfg_l3_inner == PF_INET6) {
+               util_printaddr("inner.dest6", (void *) &in_daddr6);
+               util_printaddr("inner.source6", (void *) &in_saddr6);
+       } else {
+               util_printaddr("inner.dest4", (void *) &in_daddr4);
+               util_printaddr("inner.source4", (void *) &in_saddr4);
+       }
+
+       if (!cfg_l3_outer)
+               return;
+
+       fprintf(stderr, "encap proto:   %u\n", cfg_encap_proto);
+
+       if (cfg_l3_outer == PF_INET6) {
+               util_printaddr("outer.dest6", (void *) &out_daddr6);
+               util_printaddr("outer.source6", (void *) &out_saddr6);
+       } else {
+               util_printaddr("outer.dest4", (void *) &out_daddr4);
+               util_printaddr("outer.source4", (void *) &out_saddr4);
+       }
+
+       if (!cfg_l3_extra)
+               return;
+
+       /* the extra header has its own family, independent of the outer one */
+       if (cfg_l3_extra == PF_INET6) {
+               util_printaddr("extra.dest6", (void *) &extra_daddr6);
+               util_printaddr("extra.source6", (void *) &extra_saddr6);
+       } else {
+               util_printaddr("extra.dest4", (void *) &extra_daddr4);
+               util_printaddr("extra.source4", (void *) &extra_saddr4);
+       }
+
+}
+
+/* Parse and print the options, then run the test and exit with its result. */
+int main(int argc, char **argv)
+{
+       int status;
+
+       parse_opts(argc, argv);
+       print_opts();
+
+       status = do_main();
+       return status;
+}
diff --git a/tools/testing/selftests/bpf/test_flow_dissector.sh b/tools/testing/selftests/bpf/test_flow_dissector.sh
new file mode 100755 (executable)
index 0000000..c0fb073
--- /dev/null
@@ -0,0 +1,115 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Load BPF flow dissector and verify it correctly dissects traffic
+TESTNAME=test_flow_dissector
+export TESTNAME
+
+# Set to 1 once this script has mounted bpffs itself, so that the exit
+# handler only unmounts what was mounted here.
+unmount=0
+
+# Exit status that the kselftest framework treats as "skipped".
+ksft_skip=4
+
+# Root privileges are required; skip the test otherwise.
+msg="skip all tests:"
+if [[ "$UID" -ne 0 ]]; then
+       echo "$msg please run this as root" >&2
+       exit "$ksft_skip"
+fi
+
+# The test has to run inside a network namespace. When it is started from
+# the root namespace (ip netns identify prints nothing for this shell),
+# re-run the script through in_netns.sh and pass its exit status on.
+if [[ "$(ip netns identify $$)" == "" ]]; then
+       ../net/in_netns.sh "$0" "$@"
+       exit "$?"
+fi
+
+# Trap handler: print the selftest verdict derived from the exit status seen
+# on entry, then undo the setup done by this script (tc filter, ingress
+# qdisc, attached flow dissector program and, if mounted here, bpffs).
+exit_handler()
+{
+       local status=$?
+       local verdict="FAILED"
+
+       if [ "$status" -eq 0 ]; then
+               verdict="PASS"
+       fi
+       echo "selftests: $TESTNAME [$verdict]"
+
+       # Cleanup is best effort: keep going when a step fails and hide the
+       # error output of steps that have nothing to remove.
+       set +e
+
+       tc filter del dev lo ingress pref 1337 2> /dev/null
+       tc qdisc del dev lo ingress 2> /dev/null
+       ./flow_dissector_load -d 2> /dev/null
+       if (( unmount != 0 )); then
+               umount bpffs 2> /dev/null
+       fi
+}
+
+# Abort at the first command that exits with a non-zero status; the exit
+# trap installed below still runs and reports the result.
+set -e
+
+# Report the result and clean up on normal exit and on SIGINT, SIGQUIT and
+# SIGABRT. SIGKILL (9) is deliberately not listed: it cannot be trapped.
+trap exit_handler EXIT INT QUIT ABRT
+
+# Make sure the BPF file system is mounted at /sys/fs/bpf and remember in
+# $unmount whether the mount was done by this script.
+if ! /bin/mount | grep -q /sys/fs/bpf; then
+       echo "bpffs not mounted. Mounting..."
+       unmount=1
+       /bin/mount bpffs /sys/fs/bpf -t bpf
+else
+       echo "bpffs already mounted"
+fi
+
+# Load and attach the BPF flow dissector program
+./flow_dissector_load -p bpf_flow.o -s dissect
+
+# Add the ingress qdisc on loopback that the flower filters hang off
+tc qdisc add dev lo ingress
+
+# Run the given test command three times, appending "-f PORT" for the source
+# ports 8, 9 and 10 in that order. Each run sends 10 packets. The flower rule
+# only drops traffic from port 9, so that run is additionally given -F
+# because all of its packets should be dropped; none should be dropped for
+# ports 8 and 10.
+run_ports()
+{
+       local port
+
+       for port in 8 9 10; do
+               if [ "$port" -eq 9 ]; then
+                       "$@" -f "$port" -F
+               else
+                       "$@" -f "$port"
+               fi
+       done
+}
+
+echo "Testing IPv4..."
+# Drop every IPv4/UDP packet whose source port is 9
+tc filter add dev lo parent ffff: protocol ip pref 1337 flower ip_proto \
+       udp src_port 9 action drop
+
+# Plain IPv4/UDP packets
+run_ports ./test_flow_dissector -i 4
+
+echo "Testing IPIP..."
+# IPv4/IPv4/UDP packets
+run_ports ./with_addr.sh ./with_tunnels.sh ./test_flow_dissector \
+       -o 4 -e bare -i 4 -D 192.168.0.1 -S 1.1.1.1
+
+echo "Testing IPv4 + GRE..."
+# IPv4/GRE/IPv4/UDP packets
+run_ports ./with_addr.sh ./with_tunnels.sh ./test_flow_dissector \
+       -o 4 -e gre -i 4 -D 192.168.0.1 -S 1.1.1.1
+
+# Replace the IPv4 rule with its IPv6 counterpart
+tc filter del dev lo ingress pref 1337
+
+echo "Testing IPv6..."
+# Drop every IPv6/UDP packet whose source port is 9
+tc filter add dev lo parent ffff: protocol ipv6 pref 1337 flower ip_proto \
+       udp src_port 9 action drop
+
+# Plain IPv6/UDP packets
+run_ports ./test_flow_dissector -i 6
+
+exit 0
diff --git a/tools/testing/selftests/bpf/with_addr.sh b/tools/testing/selftests/bpf/with_addr.sh
new file mode 100755 (executable)
index 0000000..ffcd395
--- /dev/null
@@ -0,0 +1,54 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# add private ipv4 and ipv6 addresses to loopback
+
+# Addresses that setup() adds to loopback and cleanup() removes again.
+readonly V6_INNER='100::a/128'
+readonly V4_INNER='192.168.0.1/32'
+
+# Optional leading -s: additionally configure a SIT device with its own v6
+# and v4 addresses. With the leading ':' getopts runs in silent mode and also
+# returns success for unknown options (opt is then '?' and OPTARG holds the
+# offending letter), so those are rejected explicitly instead of being
+# treated like -s.
+if getopts ":s" opt; then
+  if [[ "${opt}" != "s" ]]; then
+    echo "error: unknown option -${OPTARG}" 1>&2
+    exit 1
+  fi
+  readonly SIT_DEV_NAME='sixtofourtest0'
+  readonly V6_SIT='2::/64'
+  readonly V4_SIT='172.17.0.1/32'
+  # drop the option so that "$@" is just the command to run
+  shift
+fi
+
+# Print the given message to stderr, prefixed with "error: ", and terminate
+# the script with exit status 1.
+fail() {
+  printf 'error: %s\n' "$*" >&2
+  exit 1
+}
+
+# Add the inner v6/v4 addresses to loopback and, when the SIT settings are
+# defined, create the SIT device, bring it up and give it its addresses. Any
+# failing step aborts the script through fail().
+setup() {
+  if ! ip -6 addr add "${V6_INNER}" dev lo; then
+    fail 'failed to setup v6 address'
+  fi
+  if ! ip -4 addr add "${V4_INNER}" dev lo; then
+    fail 'failed to setup v4 address'
+  fi
+
+  if [[ "${V6_SIT}" != "" ]]; then
+    if ! ip link add "${SIT_DEV_NAME}" type sit remote any local any; then
+      fail 'failed to add sit'
+    fi
+    if ! ip link set dev "${SIT_DEV_NAME}" up; then
+      fail 'failed to bring sit device up'
+    fi
+    if ! ip -6 addr add "${V6_SIT}" dev "${SIT_DEV_NAME}"; then
+      fail 'failed to setup v6 SIT address'
+    fi
+    if ! ip -4 addr add "${V4_SIT}" dev "${SIT_DEV_NAME}"; then
+      fail 'failed to setup v4 SIT address'
+    fi
+  fi
+
+  # wait briefly: binding to the new addresses right away can race and fail
+  sleep 2
+}
+
+# Undo setup(): remove the SIT addresses and device when the SIT settings
+# are defined, then remove the inner addresses from loopback. Failures of
+# individual steps are not checked.
+cleanup() {
+  [[ -z "${V6_SIT}" ]] || {
+    ip -4 addr del "${V4_SIT}" dev "${SIT_DEV_NAME}"
+    ip -6 addr del "${V6_SIT}" dev "${SIT_DEV_NAME}"
+    ip link del "${SIT_DEV_NAME}"
+  }
+
+  ip -4 addr del "${V4_INNER}" dev lo
+  ip -6 addr del "${V6_INNER}" dev lo
+}
+
+# Remove the addresses again whenever the script exits.
+trap cleanup EXIT
+
+setup
+
+# Run the wrapped command and exit with its status.
+"$@"
+rc=$?
+exit "${rc}"
diff --git a/tools/testing/selftests/bpf/with_tunnels.sh b/tools/testing/selftests/bpf/with_tunnels.sh
new file mode 100755 (executable)
index 0000000..e24949e
--- /dev/null
@@ -0,0 +1,36 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# setup tunnels for flow dissection test
+
+# Per-run suffix for the tunnel device names (mktemp -u only generates a
+# name, it creates no file); presumably used to keep the names unique.
+SUFFIX="test_$(mktemp -u XXXX)"
+readonly SUFFIX
+# Endpoint arguments passed to every "ip link add" in setup().
+CONFIG='remote 127.0.0.2 local 127.0.0.1 dev lo'
+
+# Create one ipip, one gre and one sit device using the endpoints in CONFIG,
+# print the resulting tunnel list, then bring the three devices up.
+setup() {
+  local kind
+
+  # CONFIG is left unquoted on purpose so that it expands to separate words
+  for kind in ipip gre sit; do
+    ip link add "${kind}_${SUFFIX}" type "${kind}" ${CONFIG}
+  done
+
+  echo "tunnels before test:"
+  ip tunnel show
+
+  for kind in ipip gre sit; do
+    ip link set "${kind}_${SUFFIX}" up
+  done
+}
+
+
+# Delete the three tunnel devices created by setup() and print the tunnel
+# list that remains.
+cleanup() {
+  local kind
+
+  for kind in ipip gre sit; do
+    ip tunnel del "${kind}_${SUFFIX}"
+  done
+
+  echo "tunnels after test:"
+  ip tunnel show
+}
+
+# Remove the tunnels again whenever the script exits.
+trap cleanup EXIT
+
+setup
+
+# Run the wrapped command and exit with its status.
+"$@"
+rc=$?
+exit "${rc}"