ipv6: Calculate hash thresholds for IPv6 nexthops
author Ido Schimmel <idosch@mellanox.com>
Tue, 9 Jan 2018 14:40:25 +0000 (16:40 +0200)
committer David S. Miller <davem@davemloft.net>
Wed, 10 Jan 2018 20:14:44 +0000 (15:14 -0500)
Before we convert IPv6 to use hash-threshold instead of modulo-N, we
first need each nexthop to store its region boundary in the hash
function's output space.

The boundary is calculated by dividing the output space equally between
the different active nexthops. That is, nexthops that are not dead or
linkdown.

The boundaries are rebalanced whenever a nexthop is added to or removed
from a multipath route and whenever a nexthop becomes active or inactive.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/net/ip6_fib.h
include/net/ip6_route.h
net/ipv6/ip6_fib.c
net/ipv6/route.c

index ddf53dd1e9484db9a75e02b30cacf9fc47c7a04f..97cd05d87780707a6f33b65e42241da47464215a 100644 (file)
@@ -149,6 +149,7 @@ struct rt6_info {
         */
        struct list_head                rt6i_siblings;
        unsigned int                    rt6i_nsiblings;
+       atomic_t                        rt6i_nh_upper_bound;
 
        atomic_t                        rt6i_ref;
 
index 34cd3b0c6dedd6e54799f30a1f551b7a13fd7bbb..27d23a65f3cd0be2255859614690151e2d01b352 100644 (file)
@@ -66,6 +66,12 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
                (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 }
 
+static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt)
+{
+       return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
+              RTF_GATEWAY; /* RTF_GATEWAY set, RTF_ADDRCONF and RTF_DYNAMIC clear */
+}
+
 void ip6_route_input(struct sk_buff *skb);
 struct dst_entry *ip6_route_input_lookup(struct net *net,
                                         struct net_device *dev,
@@ -171,6 +177,7 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags);
 void rt6_disable_ip(struct net_device *dev, unsigned long event);
 void rt6_sync_down_dev(struct net_device *dev, unsigned long event);
+void rt6_multipath_rebalance(struct rt6_info *rt);
 
 static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb)
 {
index b5f19703fca64892883389fc97c92116b9ed4b10..e31118f417b401a3ff01996f73c4aba325289fe9 100644 (file)
@@ -796,12 +796,6 @@ insert_above:
        return ln;
 }
 
-static bool rt6_qualify_for_ecmp(struct rt6_info *rt)
-{
-       return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
-              RTF_GATEWAY;
-}
-
 static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc)
 {
        int i;
@@ -991,6 +985,7 @@ next_iter:
                        rt6i_nsiblings++;
                }
                BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings);
+               rt6_multipath_rebalance(temp_sibling);
        }
 
        /*
@@ -1672,6 +1667,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
                        sibling->rt6i_nsiblings--;
                rt->rt6i_nsiblings = 0;
                list_del_init(&rt->rt6i_siblings);
+               rt6_multipath_rebalance(next_sibling);
        }
 
        /* Adjust walkers */
index 1054b059747f9e2f9f60202bd4937f088e745c15..ced2c9bed10b8e73d587efae954e53eb51fba37e 100644 (file)
@@ -3481,6 +3481,99 @@ struct arg_netdev_event {
        };
 };
 
+static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
+{
+       struct rt6_info *iter;
+       struct fib6_node *fn;
+       /* Scan the routes attached to rt's fib6 node, starting at fn->leaf
+        * and following rt6_next, and return the first one that has the
+        * same metric as rt and qualifies for ECMP. Returns NULL when no
+        * such route is found. The lockdep annotations below expect
+        * rt->rt6i_table->tb6_lock to be held by the caller.
+        */
+       fn = rcu_dereference_protected(rt->rt6i_node,
+                       lockdep_is_held(&rt->rt6i_table->tb6_lock));
+       iter = rcu_dereference_protected(fn->leaf,
+                       lockdep_is_held(&rt->rt6i_table->tb6_lock));
+       while (iter) {
+               if (iter->rt6i_metric == rt->rt6i_metric &&
+                   rt6_qualify_for_ecmp(iter))
+                       return iter;
+               iter = rcu_dereference_protected(iter->rt6_next,
+                               lockdep_is_held(&rt->rt6i_table->tb6_lock));
+       }
+
+       return NULL;
+}
+/*
+ * Return true when rt's nexthop must be treated as dead: either
+ * RTNH_F_DEAD is set in rt6i_nh_flags, or RTNH_F_LINKDOWN is set and
+ * rt->rt6i_idev->cnf.ignore_routes_with_linkdown is non-zero.
+ * Return false otherwise.
+ */
+static bool rt6_is_dead(const struct rt6_info *rt)
+{
+       if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
+           (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
+            rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
+               return true;
+
+       return false;
+}
+/*
+ * Count the nexthops that are not dead (per rt6_is_dead()) among rt
+ * itself and every route linked on its rt6i_siblings list. Each such
+ * nexthop contributes exactly one to the returned total.
+ */
+static int rt6_multipath_total_weight(const struct rt6_info *rt)
+{
+       struct rt6_info *iter;
+       int total = 0;
+
+       if (!rt6_is_dead(rt))
+               total++;
+
+       list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
+               if (!rt6_is_dead(iter))
+                       total++;
+       }
+
+       return total;
+}
+/*
+ * Store the upper bound of rt in rt6i_nh_upper_bound.
+ *
+ * A dead nexthop gets -1 and leaves *weight untouched. Otherwise *weight
+ * is incremented and the bound becomes ((*weight << 31) / total) - 1,
+ * with the division done by DIV_ROUND_CLOSEST_ULL. When *weight reaches
+ * total the bound is therefore 2^31 - 1.
+ *
+ * @weight: running count of non-dead nexthops handled so far (in/out).
+ * @total:  divisor for the bound; the visible caller passes the result
+ *          of rt6_multipath_total_weight().
+ */
+static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
+{
+       int upper_bound = -1;
+
+       if (!rt6_is_dead(rt)) {
+               (*weight)++;
+               upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
+                                                   total) - 1;
+       }
+       atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
+}
+/*
+ * Assign upper bounds to rt and then to every route on its rt6i_siblings
+ * list, in list order. A single running weight counter, starting at 0,
+ * is shared across all calls to rt6_upper_bound_set().
+ */
+static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
+{
+       struct rt6_info *iter;
+       int weight = 0;
+
+       rt6_upper_bound_set(rt, &weight, total);
+
+       list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+               rt6_upper_bound_set(iter, &weight, total);
+}
+/*
+ * Recompute the upper bounds of all nexthops of the multipath route that
+ * rt belongs to. Returns without doing anything when rt has no siblings
+ * or is marked should_flush. If no first sibling can be located, warns
+ * once and returns. Otherwise the non-dead nexthops are counted starting
+ * from the first sibling and bounds are assigned from it onwards.
+ */
+void rt6_multipath_rebalance(struct rt6_info *rt)
+{
+       struct rt6_info *first;
+       int total;
+
+       /* In case the entire multipath route was marked for flushing,
+        * then there is no need to rebalance upon the removal of every
+        * sibling route.
+        */
+       if (!rt->rt6i_nsiblings || rt->should_flush)
+               return;
+
+       /* During lookup routes are evaluated in order, so we need to
+        * make sure upper bounds are assigned from the first sibling
+        * onwards.
+        */
+       first = rt6_multipath_first_sibling(rt);
+       if (WARN_ON_ONCE(!first))
+               return;
+
+       total = rt6_multipath_total_weight(first);
+       rt6_multipath_upper_bound_set(first, total);
+}
+
 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
 {
        const struct arg_netdev_event *arg = p_arg;
@@ -3489,6 +3582,7 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg)
        if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
                rt->rt6i_nh_flags &= ~arg->nh_flags;
                fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
+               rt6_multipath_rebalance(rt);
        }
 
        return 0;
@@ -3588,6 +3682,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
                        rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
                                                   RTNH_F_LINKDOWN);
                        fib6_update_sernum(rt);
+                       rt6_multipath_rebalance(rt);
                }
                return -2;
        case NETDEV_CHANGE:
@@ -3595,6 +3690,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
                    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
                        break;
                rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
+               rt6_multipath_rebalance(rt);
                break;
        }