xdp: Add devmap_hash map type for looking up devices by hashed index
author Toke Høiland-Jørgensen <toke@redhat.com>
Fri, 26 Jul 2019 16:06:55 +0000 (18:06 +0200)
committer Alexei Starovoitov <ast@kernel.org>
Mon, 29 Jul 2019 20:50:48 +0000 (13:50 -0700)
A common pattern when using xdp_redirect_map() is to create a device map
where the lookup key is simply ifindex. Because device maps are arrays,
this leaves holes in the map, and the map has to be sized to fit the
largest ifindex, regardless of how many devices are actually needed in
the map.

This patch adds a second type of device map where the key is looked up
using a hashmap, instead of being used as an array index. This allows maps
to be densely packed, so they can be smaller.
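
A minimal usage sketch (not part of this patch), assuming the samples/bpf-style
map definition macros; the map name, key scheme and max_entries below are
illustrative only:

/* Sketch: redirect packets out the device stored under the ingress ifindex.
 * With a DEVMAP_HASH the map only needs max_entries slots, even if the
 * ifindexes used as keys are large and sparse. */
#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") tx_port = {
	.type = BPF_MAP_TYPE_DEVMAP_HASH,
	.key_size = sizeof(__u32),	/* arbitrary u32 key, hashed internally */
	.value_size = sizeof(__u32),	/* ifindex of the target device */
	.max_entries = 8,		/* number of devices, not largest ifindex */
};

SEC("xdp")
int xdp_redirect_hash(struct xdp_md *ctx)
{
	return bpf_redirect_map(&tx_port, ctx->ingress_ifindex, 0);
}

char _license[] SEC("license") = "GPL";

Userspace would then populate the map with bpf_map_update_elem(map_fd,
&ifindex, &ifindex, 0), so only the entries actually in use consume space.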

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
include/linux/bpf.h
include/linux/bpf_types.h
include/trace/events/xdp.h
include/uapi/linux/bpf.h
kernel/bpf/devmap.c
kernel/bpf/verifier.c
net/core/filter.c

index bfdb54dd2ad14e5ea36cfad9d0d3af23bfca6096..f9a506147c8a4410b122bb6db3525e1672262af3 100644
@@ -713,6 +713,7 @@ struct xdp_buff;
 struct sk_buff;
 
 struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_flush(struct bpf_map *map);
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
                    struct net_device *dev_rx);
@@ -799,6 +800,12 @@ static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
        return NULL;
 }
 
+static inline struct net_device  *__dev_map_hash_lookup_elem(struct bpf_map *map,
+                                                            u32 key)
+{
+       return NULL;
+}
+
 static inline void __dev_map_flush(struct bpf_map *map)
 {
 }
index eec5aeeeaf928bc044e1a2f5602a0ec1e3c577f2..36a9c2325176b91c3f948bb917ed13ce07000d39 100644
@@ -62,6 +62,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
 #ifdef CONFIG_NET
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
 #if defined(CONFIG_BPF_STREAM_PARSER)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
index 68899fdc985b9d2e5dbc174dc29f9844ebebc4fd..8c8420230a10e794e46fcb37205e9b7a98f48ae4 100644
@@ -175,7 +175,8 @@ struct _bpf_dtab_netdev {
 #endif /* __DEVMAP_OBJ_TYPE */
 
 #define devmap_ifindex(fwd, map)                               \
-       ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?               \
+       ((map->map_type == BPF_MAP_TYPE_DEVMAP ||               \
+         map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ?          \
          ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)
 
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)               \
index e985f07a98ed3346a3d9c4ec4b350d6cf09d9cda..6bbef0c7f5853307b452516e960585a0a5514325 100644
@@ -134,6 +134,7 @@ enum bpf_map_type {
        BPF_MAP_TYPE_QUEUE,
        BPF_MAP_TYPE_STACK,
        BPF_MAP_TYPE_SK_STORAGE,
+       BPF_MAP_TYPE_DEVMAP_HASH,
 };
 
 /* Note that tracing related programs such as
index a0501266bdb8f3ca6689da6081564330e2ec4ab2..9af048a932b5ff034e80b5ebebc1ebac2d7b1f81 100644
  * notifier hook walks the map we know that new dev references can not be
  * added by the user because core infrastructure ensures dev_get_by_index()
  * calls will fail at this point.
+ *
+ * The devmap_hash type is a map type which interprets keys as ifindexes and
+ * indexes these using a hashmap. This allows maps that use ifindex as key to be
+ * densely packed instead of having holes in the lookup array for unused
+ * ifindexes. The setup and packet enqueue/send code is shared between the two
+ * types of devmap; only the lookup and insertion is different.
  */
 #include <linux/bpf.h>
 #include <net/xdp.h>
@@ -59,6 +65,7 @@ struct xdp_bulk_queue {
 
 struct bpf_dtab_netdev {
        struct net_device *dev; /* must be first member, due to tracepoint */
+       struct hlist_node index_hlist;
        struct bpf_dtab *dtab;
        struct xdp_bulk_queue __percpu *bulkq;
        struct rcu_head rcu;
@@ -70,11 +77,30 @@ struct bpf_dtab {
        struct bpf_dtab_netdev **netdev_map;
        struct list_head __percpu *flush_list;
        struct list_head list;
+
+       /* these are only used for DEVMAP_HASH type maps */
+       struct hlist_head *dev_index_head;
+       spinlock_t index_lock;
+       unsigned int items;
+       u32 n_buckets;
 };
 
 static DEFINE_SPINLOCK(dev_map_lock);
 static LIST_HEAD(dev_map_list);
 
+static struct hlist_head *dev_map_create_hash(unsigned int entries)
+{
+       int i;
+       struct hlist_head *hash;
+
+       hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL);
+       if (hash != NULL)
+               for (i = 0; i < entries; i++)
+                       INIT_HLIST_HEAD(&hash[i]);
+
+       return hash;
+}
+
 static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
 {
        int err, cpu;
@@ -97,6 +123,14 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
        cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
        cost += sizeof(struct list_head) * num_possible_cpus();
 
+       if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+               dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
+
+               if (!dtab->n_buckets) /* Overflow check */
+                       return -EINVAL;
+               cost += sizeof(struct hlist_head) * dtab->n_buckets;
+       }
+
        /* if map size is larger than memlock limit, reject it */
        err = bpf_map_charge_init(&dtab->map.memory, cost);
        if (err)
@@ -115,8 +149,18 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
        if (!dtab->netdev_map)
                goto free_percpu;
 
+       if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+               dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets);
+               if (!dtab->dev_index_head)
+                       goto free_map_area;
+
+               spin_lock_init(&dtab->index_lock);
+       }
+
        return 0;
 
+free_map_area:
+       bpf_map_area_free(dtab->netdev_map);
 free_percpu:
        free_percpu(dtab->flush_list);
 free_charge:
@@ -198,6 +242,7 @@ static void dev_map_free(struct bpf_map *map)
 
        free_percpu(dtab->flush_list);
        bpf_map_area_free(dtab->netdev_map);
+       kfree(dtab->dev_index_head);
        kfree(dtab);
 }
 
@@ -218,6 +263,70 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
        return 0;
 }
 
+static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
+                                                   int idx)
+{
+       return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
+}
+
+struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
+{
+       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+       struct hlist_head *head = dev_map_index_hash(dtab, key);
+       struct bpf_dtab_netdev *dev;
+
+       hlist_for_each_entry_rcu(dev, head, index_hlist)
+               if (dev->idx == key)
+                       return dev;
+
+       return NULL;
+}
+
+static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
+                                   void *next_key)
+{
+       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+       u32 idx, *next = next_key;
+       struct bpf_dtab_netdev *dev, *next_dev;
+       struct hlist_head *head;
+       int i = 0;
+
+       if (!key)
+               goto find_first;
+
+       idx = *(u32 *)key;
+
+       dev = __dev_map_hash_lookup_elem(map, idx);
+       if (!dev)
+               goto find_first;
+
+       next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
+                                   struct bpf_dtab_netdev, index_hlist);
+
+       if (next_dev) {
+               *next = next_dev->idx;
+               return 0;
+       }
+
+       i = idx & (dtab->n_buckets - 1);
+       i++;
+
+ find_first:
+       for (; i < dtab->n_buckets; i++) {
+               head = dev_map_index_hash(dtab, i);
+
+               next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+                                           struct bpf_dtab_netdev,
+                                           index_hlist);
+               if (next_dev) {
+                       *next = next_dev->idx;
+                       return 0;
+               }
+       }
+
+       return -ENOENT;
+}
+
 static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
                       bool in_napi_ctx)
 {
@@ -373,6 +482,15 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
        return dev ? &dev->ifindex : NULL;
 }
 
+static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+       struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
+                                                               *(u32 *)key);
+       struct net_device *dev = obj ? obj->dev : NULL;
+
+       return dev ? &dev->ifindex : NULL;
+}
+
 static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 {
        if (dev->dev->netdev_ops->ndo_xdp_xmit) {
@@ -422,6 +540,28 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
        return 0;
 }
 
+static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
+{
+       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+       struct bpf_dtab_netdev *old_dev;
+       int k = *(u32 *)key;
+       unsigned long flags;
+       int ret = -ENOENT;
+
+       spin_lock_irqsave(&dtab->index_lock, flags);
+
+       old_dev = __dev_map_hash_lookup_elem(map, k);
+       if (old_dev) {
+               dtab->items--;
+               hlist_del_init_rcu(&old_dev->index_hlist);
+               call_rcu(&old_dev->rcu, __dev_map_entry_free);
+               ret = 0;
+       }
+       spin_unlock_irqrestore(&dtab->index_lock, flags);
+
+       return ret;
+}
+
 static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
                                                    struct bpf_dtab *dtab,
                                                    u32 ifindex,
@@ -502,6 +642,56 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
                                     map, key, value, map_flags);
 }
 
+static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
+                                    void *key, void *value, u64 map_flags)
+{
+       struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+       struct bpf_dtab_netdev *dev, *old_dev;
+       u32 ifindex = *(u32 *)value;
+       u32 idx = *(u32 *)key;
+       unsigned long flags;
+
+       if (unlikely(map_flags > BPF_EXIST || !ifindex))
+               return -EINVAL;
+
+       old_dev = __dev_map_hash_lookup_elem(map, idx);
+       if (old_dev && (map_flags & BPF_NOEXIST))
+               return -EEXIST;
+
+       dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
+       if (IS_ERR(dev))
+               return PTR_ERR(dev);
+
+       spin_lock_irqsave(&dtab->index_lock, flags);
+
+       if (old_dev) {
+               hlist_del_rcu(&old_dev->index_hlist);
+       } else {
+               if (dtab->items >= dtab->map.max_entries) {
+                       spin_unlock_irqrestore(&dtab->index_lock, flags);
+                       call_rcu(&dev->rcu, __dev_map_entry_free);
+                       return -E2BIG;
+               }
+               dtab->items++;
+       }
+
+       hlist_add_head_rcu(&dev->index_hlist,
+                          dev_map_index_hash(dtab, idx));
+       spin_unlock_irqrestore(&dtab->index_lock, flags);
+
+       if (old_dev)
+               call_rcu(&old_dev->rcu, __dev_map_entry_free);
+
+       return 0;
+}
+
+static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
+                                  u64 map_flags)
+{
+       return __dev_map_hash_update_elem(current->nsproxy->net_ns,
+                                        map, key, value, map_flags);
+}
+
 const struct bpf_map_ops dev_map_ops = {
        .map_alloc = dev_map_alloc,
        .map_free = dev_map_free,
@@ -512,6 +702,16 @@ const struct bpf_map_ops dev_map_ops = {
        .map_check_btf = map_check_no_btf,
 };
 
+const struct bpf_map_ops dev_map_hash_ops = {
+       .map_alloc = dev_map_alloc,
+       .map_free = dev_map_free,
+       .map_get_next_key = dev_map_hash_get_next_key,
+       .map_lookup_elem = dev_map_hash_lookup_elem,
+       .map_update_elem = dev_map_hash_update_elem,
+       .map_delete_elem = dev_map_hash_delete_elem,
+       .map_check_btf = map_check_no_btf,
+};
+
 static int dev_map_notification(struct notifier_block *notifier,
                                ulong event, void *ptr)
 {
index 5900cbb966b17adb04538e887b2f2eff658ba823..cef851cd5c36ab0b3bb9dd882e03b2890b6f6963 100644
@@ -3457,6 +3457,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
                        goto error;
                break;
        case BPF_MAP_TYPE_DEVMAP:
+       case BPF_MAP_TYPE_DEVMAP_HASH:
                if (func_id != BPF_FUNC_redirect_map &&
                    func_id != BPF_FUNC_map_lookup_elem)
                        goto error;
@@ -3539,6 +3540,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
                break;
        case BPF_FUNC_redirect_map:
                if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
+                   map->map_type != BPF_MAP_TYPE_DEVMAP_HASH &&
                    map->map_type != BPF_MAP_TYPE_CPUMAP &&
                    map->map_type != BPF_MAP_TYPE_XSKMAP)
                        goto error;
index 3961437ccc506eb858365d1a32e2b7cadec0c84e..1eee70f80fbad224fe4eb64b4b9f7eca371da956 100644
@@ -3517,7 +3517,8 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
        int err;
 
        switch (map->map_type) {
-       case BPF_MAP_TYPE_DEVMAP: {
+       case BPF_MAP_TYPE_DEVMAP:
+       case BPF_MAP_TYPE_DEVMAP_HASH: {
                struct bpf_dtab_netdev *dst = fwd;
 
                err = dev_map_enqueue(dst, xdp, dev_rx);
@@ -3554,6 +3555,7 @@ void xdp_do_flush_map(void)
        if (map) {
                switch (map->map_type) {
                case BPF_MAP_TYPE_DEVMAP:
+               case BPF_MAP_TYPE_DEVMAP_HASH:
                        __dev_map_flush(map);
                        break;
                case BPF_MAP_TYPE_CPUMAP:
@@ -3574,6 +3576,8 @@ static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
        switch (map->map_type) {
        case BPF_MAP_TYPE_DEVMAP:
                return __dev_map_lookup_elem(map, index);
+       case BPF_MAP_TYPE_DEVMAP_HASH:
+               return __dev_map_hash_lookup_elem(map, index);
        case BPF_MAP_TYPE_CPUMAP:
                return __cpu_map_lookup_elem(map, index);
        case BPF_MAP_TYPE_XSKMAP:
@@ -3655,7 +3659,8 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
        ri->tgt_value = NULL;
        WRITE_ONCE(ri->map, NULL);
 
-       if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+       if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
+           map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
                struct bpf_dtab_netdev *dst = fwd;
 
                err = dev_map_generic_redirect(dst, skb, xdp_prog);