net/ipv6: Simplify route replace and appending into multipath route
authorDavid Ahern <dsahern@gmail.com>
Mon, 21 May 2018 17:26:53 +0000 (10:26 -0700)
committerDavid S. Miller <davem@davemloft.net>
Tue, 22 May 2018 18:44:18 +0000 (14:44 -0400)
Bring consistency to ipv6 route replace and append semantics.

Remove rt6_qualify_for_ecmp which is just guess work. It fails in 2 cases:
1. can not replace a route with a reject route. Existing code appends
   a new route instead of replacing the existing one.

2. can not have a multipath route where a leg uses a dev only nexthop

Existing use cases affected by this change:
1. adding a route with existing prefix and metric using NLM_F_CREATE
   without NLM_F_APPEND or NLM_F_EXCL (ie., what iproute2 calls
   'prepend'). Existing code auto-determines that the new nexthop can
   be appended to an existing route to create a multipath route. This
   change breaks that by requiring the APPEND flag for the new route
   to be added to an existing one. Instead the prepend just adds another
   route entry.

2. route replace. Existing code replaces first matching multipath route
   if new route is multipath capable and fallback to first matching
   non-ECMP route (reject or dev only route) in case one isn't available.
   New behavior replaces first matching route. (Thanks to Ido for spotting
   this one)

Note: Newer iproute2 is needed to display multipath routes with a dev-only
      nexthop. This is due to a bug in iproute2 and parsing nexthops.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/net/ip6_route.h
net/ipv6/ip6_fib.c
net/ipv6/route.c

index 4cf1ef935ed9e678f2551879dda9df0ce08b1758..9e4d0f0aeb6de79b1cca39a91cc93b804ff03776 100644 (file)
@@ -66,12 +66,6 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
                (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 }
 
-static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
-{
-       return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
-              RTF_GATEWAY;
-}
-
 void ip6_route_input(struct sk_buff *skb);
 struct dst_entry *ip6_route_input_lookup(struct net *net,
                                         struct net_device *dev,
index d1dc6017f5a69d6ceb4fcade98ca931567925f84..f9132a6de917449e871a9d0b0c07b7bf91145d1a 100644 (file)
@@ -934,19 +934,19 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 {
        struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
                                    lockdep_is_held(&rt->fib6_table->tb6_lock));
-       struct fib6_info *iter = NULL;
+       struct fib6_info *iter = NULL, *match = NULL;
        struct fib6_info __rcu **ins;
-       struct fib6_info __rcu **fallback_ins = NULL;
        int replace = (info->nlh &&
                       (info->nlh->nlmsg_flags & NLM_F_REPLACE));
+       int append = (info->nlh &&
+                      (info->nlh->nlmsg_flags & NLM_F_APPEND));
        int add = (!info->nlh ||
                   (info->nlh->nlmsg_flags & NLM_F_CREATE));
        int found = 0;
-       bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
        u16 nlflags = NLM_F_EXCL;
        int err;
 
-       if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
+       if (append)
                nlflags |= NLM_F_APPEND;
 
        ins = &fn->leaf;
@@ -968,13 +968,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 
                        nlflags &= ~NLM_F_EXCL;
                        if (replace) {
-                               if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
-                                       found++;
-                                       break;
-                               }
-                               if (rt_can_ecmp)
-                                       fallback_ins = fallback_ins ?: ins;
-                               goto next_iter;
+                               found++;
+                               break;
                        }
 
                        if (rt6_duplicate_nexthop(iter, rt)) {
@@ -989,86 +984,67 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
                                fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu);
                                return -EEXIST;
                        }
-                       /* If we have the same destination and the same metric,
-                        * but not the same gateway, then the route we try to
-                        * add is sibling to this route, increment our counter
-                        * of siblings, and later we will add our route to the
-                        * list.
-                        * Only static routes (which don't have flag
-                        * RTF_EXPIRES) are used for ECMPv6.
-                        *
-                        * To avoid long list, we only had siblings if the
-                        * route have a gateway.
-                        */
-                       if (rt_can_ecmp &&
-                           rt6_qualify_for_ecmp(iter))
-                               rt->fib6_nsiblings++;
+
+                       /* first route that matches */
+                       if (!match)
+                               match = iter;
                }
 
                if (iter->fib6_metric > rt->fib6_metric)
                        break;
 
-next_iter:
                ins = &iter->fib6_next;
        }
 
-       if (fallback_ins && !found) {
-               /* No ECMP-able route found, replace first non-ECMP one */
-               ins = fallback_ins;
-               iter = rcu_dereference_protected(*ins,
-                                   lockdep_is_held(&rt->fib6_table->tb6_lock));
-               found++;
-       }
-
        /* Reset round-robin state, if necessary */
        if (ins == &fn->leaf)
                fn->rr_ptr = NULL;
 
        /* Link this route to others same route. */
-       if (rt->fib6_nsiblings) {
-               unsigned int fib6_nsiblings;
+       if (append && match) {
                struct fib6_info *sibling, *temp_sibling;
 
-               /* Find the first route that have the same metric */
-               sibling = leaf;
-               while (sibling) {
-                       if (sibling->fib6_metric == rt->fib6_metric &&
-                           rt6_qualify_for_ecmp(sibling)) {
-                               list_add_tail(&rt->fib6_siblings,
-                                             &sibling->fib6_siblings);
-                               break;
-                       }
-                       sibling = rcu_dereference_protected(sibling->fib6_next,
-                                   lockdep_is_held(&rt->fib6_table->tb6_lock));
+               if (rt->fib6_flags & RTF_REJECT) {
+                       NL_SET_ERR_MSG(extack,
+                                      "Can not append a REJECT route");
+                       return -EINVAL;
+               } else if (match->fib6_flags & RTF_REJECT) {
+                       NL_SET_ERR_MSG(extack,
+                                      "Can not append to a REJECT route");
+                       return -EINVAL;
                }
+               rt->fib6_nsiblings = match->fib6_nsiblings;
+               list_add_tail(&rt->fib6_siblings, &match->fib6_siblings);
+               match->fib6_nsiblings++;
+
                /* For each sibling in the list, increment the counter of
                 * siblings. BUG() if counters does not match, list of siblings
                 * is broken!
                 */
-               fib6_nsiblings = 0;
                list_for_each_entry_safe(sibling, temp_sibling,
-                                        &rt->fib6_siblings, fib6_siblings) {
+                                        &match->fib6_siblings, fib6_siblings) {
                        sibling->fib6_nsiblings++;
-                       BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
-                       fib6_nsiblings++;
+                       BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings);
                }
-               BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
-               rt6_multipath_rebalance(temp_sibling);
+
+               rt6_multipath_rebalance(match);
        }
 
        /*
         *      insert node
         */
        if (!replace) {
+               enum fib_event_type event;
+
                if (!add)
                        pr_warn("NLM_F_CREATE should be set when creating new route\n");
 
 add:
                nlflags |= NLM_F_CREATE;
 
-               err = call_fib6_entry_notifiers(info->nl_net,
-                                               FIB_EVENT_ENTRY_ADD,
-                                               rt, extack);
+               event = append ? FIB_EVENT_ENTRY_APPEND : FIB_EVENT_ENTRY_ADD;
+               err = call_fib6_entry_notifiers(info->nl_net, event, rt,
+                                               extack);
                if (err)
                        return err;
 
@@ -1086,7 +1062,7 @@ add:
                }
 
        } else {
-               int nsiblings;
+               struct fib6_info *tmp;
 
                if (!found) {
                        if (add)
@@ -1101,48 +1077,57 @@ add:
                if (err)
                        return err;
 
+               /* if route being replaced has siblings, set tmp to
+                * last one, otherwise tmp is current route. this is
+                * used to set fib6_next for new route
+                */
+               if (iter->fib6_nsiblings)
+                       tmp = list_last_entry(&iter->fib6_siblings,
+                                             struct fib6_info,
+                                             fib6_siblings);
+               else
+                       tmp = iter;
+
+               /* insert new route */
                atomic_inc(&rt->fib6_ref);
                rcu_assign_pointer(rt->fib6_node, fn);
-               rt->fib6_next = iter->fib6_next;
+               rt->fib6_next = tmp->fib6_next;
                rcu_assign_pointer(*ins, rt);
+
                if (!info->skip_notify)
                        inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
                if (!(fn->fn_flags & RTN_RTINFO)) {
                        info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
                        fn->fn_flags |= RTN_RTINFO;
                }
-               nsiblings = iter->fib6_nsiblings;
-               iter->fib6_node = NULL;
-               fib6_purge_rt(iter, fn, info->nl_net);
-               if (rcu_access_pointer(fn->rr_ptr) == iter)
-                       fn->rr_ptr = NULL;
-               fib6_info_release(iter);
 
-               if (nsiblings) {
+               /* delete old route */
+               rt = iter;
+
+               if (rt->fib6_nsiblings) {
+                       struct fib6_info *tmp;
+
                        /* Replacing an ECMP route, remove all siblings */
-                       ins = &rt->fib6_next;
-                       iter = rcu_dereference_protected(*ins,
-                                   lockdep_is_held(&rt->fib6_table->tb6_lock));
-                       while (iter) {
-                               if (iter->fib6_metric > rt->fib6_metric)
-                                       break;
-                               if (rt6_qualify_for_ecmp(iter)) {
-                                       *ins = iter->fib6_next;
-                                       iter->fib6_node = NULL;
-                                       fib6_purge_rt(iter, fn, info->nl_net);
-                                       if (rcu_access_pointer(fn->rr_ptr) == iter)
-                                               fn->rr_ptr = NULL;
-                                       fib6_info_release(iter);
-                                       nsiblings--;
-                                       info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
-                               } else {
-                                       ins = &iter->fib6_next;
-                               }
-                               iter = rcu_dereference_protected(*ins,
-                                       lockdep_is_held(&rt->fib6_table->tb6_lock));
+                       list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings,
+                                                fib6_siblings) {
+                               iter->fib6_node = NULL;
+                               fib6_purge_rt(iter, fn, info->nl_net);
+                               if (rcu_access_pointer(fn->rr_ptr) == iter)
+                                       fn->rr_ptr = NULL;
+                               fib6_info_release(iter);
+
+                               rt->fib6_nsiblings--;
+                               info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
                        }
-                       WARN_ON(nsiblings != 0);
                }
+
+               WARN_ON(rt->fib6_nsiblings != 0);
+
+               rt->fib6_node = NULL;
+               fib6_purge_rt(rt, fn, info->nl_net);
+               if (rcu_access_pointer(fn->rr_ptr) == rt)
+                       fn->rr_ptr = NULL;
+               fib6_info_release(rt);
        }
 
        return 0;
index cc24ed3bc3349d3116ae87b11adc7bb2ec80b3a9..bcb8785c0451c418ae32b9ca1a68db52fe2d3fe0 100644 (file)
@@ -3791,7 +3791,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
                        lockdep_is_held(&rt->fib6_table->tb6_lock));
        while (iter) {
                if (iter->fib6_metric == rt->fib6_metric &&
-                   rt6_qualify_for_ecmp(iter))
+                   iter->fib6_nsiblings)
                        return iter;
                iter = rcu_dereference_protected(iter->fib6_next,
                                lockdep_is_held(&rt->fib6_table->tb6_lock));
@@ -4381,6 +4381,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
                 */
                cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
                                                     NLM_F_REPLACE);
+               cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND;
                nhn++;
        }