From f34436a430920bdfdba575b509d994c619f5101d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 21 May 2018 10:26:53 -0700 Subject: [PATCH] net/ipv6: Simplify route replace and appending into multipath route Bring consistency to ipv6 route replace and append semantics. Remove rt6_qualify_for_ecmp which is just guess work. It fails in 2 cases: 1. can not replace a route with a reject route. Existing code appends a new route instead of replacing the existing one. 2. can not have a multipath route where a leg uses a dev only nexthop Existing use cases affected by this change: 1. adding a route with existing prefix and metric using NLM_F_CREATE without NLM_F_APPEND or NLM_F_EXCL (ie., what iproute2 calls 'prepend'). Existing code auto-determines that the new nexthop can be appended to an existing route to create a multipath route. This change breaks that by requiring the APPEND flag for the new route to be added to an existing one. Instead the prepend just adds another route entry. 2. route replace. Existing code replaces first matching multipath route if new route is multipath capable and fallback to first matching non-ECMP route (reject or dev only route) in case one isn't available. New behavior replaces first matching route. (Thanks to Ido for spotting this one) Note: Newer iproute2 is needed to display multipath routes with a dev-only nexthop. This is due to a bug in iproute2 and parsing nexthops. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 6 -- net/ipv6/ip6_fib.c | 157 ++++++++++++++++++---------------------- net/ipv6/route.c | 3 +- 3 files changed, 73 insertions(+), 93 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 4cf1ef935ed9..9e4d0f0aeb6d 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,12 +66,6 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } -static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) -{ - return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == - RTF_GATEWAY; -} - void ip6_route_input(struct sk_buff *skb); struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index d1dc6017f5a6..f9132a6de917 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -934,19 +934,19 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); - struct fib6_info *iter = NULL; + struct fib6_info *iter = NULL, *match = NULL; struct fib6_info __rcu **ins; - struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); + int append = (info->nlh && + (info->nlh->nlmsg_flags & NLM_F_APPEND)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; - bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); u16 nlflags = NLM_F_EXCL; int err; - if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) + if (append) nlflags |= NLM_F_APPEND; ins = &fn->leaf; @@ -968,13 +968,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, nlflags &= ~NLM_F_EXCL; if (replace) { - if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { - found++; - break; - } - if (rt_can_ecmp) - fallback_ins = fallback_ins ?: ins; - goto next_iter; + found++; + break; } if (rt6_duplicate_nexthop(iter, rt)) { @@ -989,86 +984,67 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } - /* If we have the same destination and the same metric, - * but not the same gateway, then the route we try to - * add is sibling to this route, increment our counter - * of siblings, and later we will add our route to the - * list. - * Only static routes (which don't have flag - * RTF_EXPIRES) are used for ECMPv6. - * - * To avoid long list, we only had siblings if the - * route have a gateway. - */ - if (rt_can_ecmp && - rt6_qualify_for_ecmp(iter)) - rt->fib6_nsiblings++; + + /* first route that matches */ + if (!match) + match = iter; } if (iter->fib6_metric > rt->fib6_metric) break; -next_iter: ins = &iter->fib6_next; } - if (fallback_ins && !found) { - /* No ECMP-able route found, replace first non-ECMP one */ - ins = fallback_ins; - iter = rcu_dereference_protected(*ins, - lockdep_is_held(&rt->fib6_table->tb6_lock)); - found++; - } - /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to others same route. */ - if (rt->fib6_nsiblings) { - unsigned int fib6_nsiblings; + if (append && match) { struct fib6_info *sibling, *temp_sibling; - /* Find the first route that have the same metric */ - sibling = leaf; - while (sibling) { - if (sibling->fib6_metric == rt->fib6_metric && - rt6_qualify_for_ecmp(sibling)) { - list_add_tail(&rt->fib6_siblings, - &sibling->fib6_siblings); - break; - } - sibling = rcu_dereference_protected(sibling->fib6_next, - lockdep_is_held(&rt->fib6_table->tb6_lock)); + if (rt->fib6_flags & RTF_REJECT) { + NL_SET_ERR_MSG(extack, + "Can not append a REJECT route"); + return -EINVAL; + } else if (match->fib6_flags & RTF_REJECT) { + NL_SET_ERR_MSG(extack, + "Can not append to a REJECT route"); + return -EINVAL; } + rt->fib6_nsiblings = match->fib6_nsiblings; + list_add_tail(&rt->fib6_siblings, &match->fib6_siblings); + match->fib6_nsiblings++; + /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ - fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, - &rt->fib6_siblings, fib6_siblings) { + &match->fib6_siblings, fib6_siblings) { sibling->fib6_nsiblings++; - BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); - fib6_nsiblings++; + BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings); } - BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); - rt6_multipath_rebalance(temp_sibling); + + rt6_multipath_rebalance(match); } /* * insert node */ if (!replace) { + enum fib_event_type event; + if (!add) pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: nlflags |= NLM_F_CREATE; - err = call_fib6_entry_notifiers(info->nl_net, - FIB_EVENT_ENTRY_ADD, - rt, extack); + event = append ? FIB_EVENT_ENTRY_APPEND : FIB_EVENT_ENTRY_ADD; + err = call_fib6_entry_notifiers(info->nl_net, event, rt, + extack); if (err) return err; @@ -1086,7 +1062,7 @@ add: } } else { - int nsiblings; + struct fib6_info *tmp; if (!found) { if (add) @@ -1101,48 +1077,57 @@ add: if (err) return err; + /* if route being replaced has siblings, set tmp to + * last one, otherwise tmp is current route. this is + * used to set fib6_next for new route + */ + if (iter->fib6_nsiblings) + tmp = list_last_entry(&iter->fib6_siblings, + struct fib6_info, + fib6_siblings); + else + tmp = iter; + + /* insert new route */ atomic_inc(&rt->fib6_ref); rcu_assign_pointer(rt->fib6_node, fn); - rt->fib6_next = iter->fib6_next; + rt->fib6_next = tmp->fib6_next; rcu_assign_pointer(*ins, rt); + if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } - nsiblings = iter->fib6_nsiblings; - iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); - if (rcu_access_pointer(fn->rr_ptr) == iter) - fn->rr_ptr = NULL; - fib6_info_release(iter); - if (nsiblings) { + /* delete old route */ + rt = iter; + + if (rt->fib6_nsiblings) { + struct fib6_info *tmp; + /* Replacing an ECMP route, remove all siblings */ - ins = &rt->fib6_next; - iter = rcu_dereference_protected(*ins, - lockdep_is_held(&rt->fib6_table->tb6_lock)); - while (iter) { - if (iter->fib6_metric > rt->fib6_metric) - break; - if (rt6_qualify_for_ecmp(iter)) { - *ins = iter->fib6_next; - iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); - if (rcu_access_pointer(fn->rr_ptr) == iter) - fn->rr_ptr = NULL; - fib6_info_release(iter); - nsiblings--; - info->nl_net->ipv6.rt6_stats->fib_rt_entries--; - } else { - ins = &iter->fib6_next; - } - iter = rcu_dereference_protected(*ins, - lockdep_is_held(&rt->fib6_table->tb6_lock)); + list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings, + fib6_siblings) { + iter->fib6_node = NULL; + fib6_purge_rt(iter, fn, info->nl_net); + if (rcu_access_pointer(fn->rr_ptr) == iter) + fn->rr_ptr = NULL; + fib6_info_release(iter); + + rt->fib6_nsiblings--; + info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } - WARN_ON(nsiblings != 0); } + + WARN_ON(rt->fib6_nsiblings != 0); + + rt->fib6_node = NULL; + fib6_purge_rt(rt, fn, info->nl_net); + if (rcu_access_pointer(fn->rr_ptr) == rt) + fn->rr_ptr = NULL; + fib6_info_release(rt); } return 0; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index cc24ed3bc334..bcb8785c0451 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3791,7 +3791,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric == rt->fib6_metric && - rt6_qualify_for_ecmp(iter)) + iter->fib6_nsiblings) return iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); @@ -4381,6 +4381,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, */ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_REPLACE); + cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND; nhn++; } -- 2.30.2