From c9c7b4b3945c01c2aadf3ef5d9a77c8200db80f1 Mon Sep 17 00:00:00 2001
From: Aaron Goodman <aaronjg@stanford.edu>
Date: Sat, 14 Nov 2020 23:51:38 -0500
Subject: [PATCH] kernel: add netfilter-actual-sk patch

Backport of Linux kernel commit 46d6c5a to the 4.14 kernel.

netfilter: use actual socket sk rather than skb sk when routing harder

Signed-off-by: Aaron Goodman <aaronjg@stanford.edu>
---
 .../373-netfilter_actual_sk.patch             | 234 ++++++++++++++++++
 1 file changed, 234 insertions(+)
 create mode 100644 target/linux/generic/backport-4.14/373-netfilter_actual_sk.patch

diff --git a/target/linux/generic/backport-4.14/373-netfilter_actual_sk.patch b/target/linux/generic/backport-4.14/373-netfilter_actual_sk.patch
new file mode 100644
index 0000000000..21722ceb23
--- /dev/null
+++ b/target/linux/generic/backport-4.14/373-netfilter_actual_sk.patch
@@ -0,0 +1,234 @@
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+To: Pablo Neira Ayuso <pablo@netfilter.org>,
+	netfilter-devel@vger.kernel.org, netdev@vger.kernel.org
+Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Subject: [PATCH nf 2/2] netfilter: use actual socket sk rather than skb sk when routing harder
+Date: Thu, 29 Oct 2020 03:56:06 +0100
+Message-ID: <20201029025606.3523771-3-Jason@zx2c4.com>
+In-Reply-To: <20201029025606.3523771-1-Jason@zx2c4.com>
+
+If netfilter changes the packet mark when mangling, the packet is
+rerouted using the route_me_harder set of functions. Prior to this
+commit, there's one big difference between route_me_harder and the
+ordinary initial routing functions, described in the comment above
+__ip_queue_xmit():
+
+    /* Note: skb->sk can be different from sk, in case of tunnels */
+    int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
+
+That function goes on to correctly make use of sk->sk_bound_dev_if,
+rather than skb->sk->sk_bound_dev_if. And indeed the comment is true: a
+tunnel will receive a packet in ndo_start_xmit with an initial skb->sk.
+It will make some transformations to that packet, and then it will send
+the encapsulated packet out of a *new* socket. That new socket will
+basically always have a different sk_bound_dev_if (otherwise there'd be
+a routing loop). So for the purposes of routing the encapsulated packet,
+the routing information as it pertains to the socket should come from
+that socket's sk, rather than the packet's original skb->sk. For that
+reason __ip_queue_xmit() and related functions all do the right thing.
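+
+For reference, here is the routing call itself, condensed from
+ip_queue_xmit() in 4.14's net/ipv4/ip_output.c (the function that later
+kernels split into __ip_queue_xmit()). Since inet here is inet_sk(sk),
+every socket-derived input to the lookup, most importantly
+sk->sk_bound_dev_if, comes from the sk argument; skb->sk is never
+consulted:
+
+    /* condensed: surrounding code and error handling elided */
+    rt = ip_route_output_ports(net, fl4, sk,
+                               daddr, inet->inet_saddr,
+                               inet->inet_dport,
+                               inet->inet_sport,
+                               sk->sk_protocol,
+                               RT_CONN_FLAGS(sk),
+                               sk->sk_bound_dev_if);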
+
+One might argue that all tunnels should just call skb_orphan(skb) before
+transmitting the encapsulated packet into the new socket. But tunnels do
+*not* do this -- and this is wisely avoided in skb_scrub_packet() too --
+because features like TSQ rely on skb->destructor() being called when
+that buffer space is truly available again. Calling skb_orphan(skb) too
+early would result in buffers filling up unnecessarily and accounting
+info being all wrong. Instead, additional routing must take into account
+the new sk, just as __ip_queue_xmit() notes.
+
+So, this commit addresses the problem by fishing the correct sk out of
+state->sk -- it's already set properly in the call to nf_hook() in
+__ip_local_out(), which receives the sk as part of its normal
+functionality. So we make sure to plumb state->sk through the various
+route_me_harder functions, and then make correct use of it following the
+example of __ip_queue_xmit().
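+
+That plumbing can be seen in 4.14's __ip_local_out(), condensed here
+from net/ipv4/ip_output.c: the sk it receives is handed straight to
+nf_hook(), which records it (via nf_hook_state_init()) in the
+nf_hook_state that the hooks patched below see as state:
+
+    int __ip_local_out(struct net *net, struct sock *sk,
+                       struct sk_buff *skb)
+    {
+            ...
+            return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
+                           net, sk, skb, NULL, skb_dst(skb)->dev,
+                           dst_output);
+    }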
+
+Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+[aaronjg@stanford.edu: backported to 4.14]
+Signed-off-by: Aaron Goodman <aaronjg@stanford.edu>
+--- a/include/linux/netfilter_ipv4.h
++++ b/include/linux/netfilter_ipv4.h
+@@ -16,7 +16,7 @@ struct ip_rt_info {
+ 	u_int32_t mark;
+ };
+ 
+-int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned addr_type);
++int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned addr_type);
+ 
+ struct nf_queue_entry;
+ 
+--- a/include/linux/netfilter_ipv6.h
++++ b/include/linux/netfilter_ipv6.h
+@@ -41,7 +41,7 @@ struct nf_ipv6_ops {
+ };
+ 
+ #ifdef CONFIG_NETFILTER
+-int ip6_route_me_harder(struct net *net, struct sk_buff *skb);
++int ip6_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb);
+ __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
+ 			unsigned int dataoff, u_int8_t protocol);
+ 
+--- a/net/ipv4/netfilter.c
++++ b/net/ipv4/netfilter.c
+@@ -17,17 +17,19 @@
+ #include <net/netfilter/nf_queue.h>
+ 
+ /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
+-int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_type)
++int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int addr_type)
+ {
+ 	const struct iphdr *iph = ip_hdr(skb);
+ 	struct rtable *rt;
+ 	struct flowi4 fl4 = {};
+ 	__be32 saddr = iph->saddr;
+-	const struct sock *sk = skb_to_full_sk(skb);
+-	__u8 flags = sk ? inet_sk_flowi_flags(sk) : 0;
++	__u8 flags;
+ 	struct net_device *dev = skb_dst(skb)->dev;
+ 	unsigned int hh_len;
+ 
++	sk = sk_to_full_sk(sk);
++	flags = sk ? inet_sk_flowi_flags(sk) : 0;
++
+ 	if (addr_type == RTN_UNSPEC)
+ 		addr_type = inet_addr_type_dev_table(net, dev, saddr);
+ 	if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
+@@ -91,7 +93,7 @@ int nf_ip_reroute(struct sk_buff *skb, c
+ 		      skb->mark == rt_info->mark &&
+ 		      iph->daddr == rt_info->daddr &&
+ 		      iph->saddr == rt_info->saddr))
+-			return ip_route_me_harder(entry->state.net, skb,
++			return ip_route_me_harder(entry->state.net, entry->state.sk, skb,
+ 						  RTN_UNSPEC);
+ 	}
+ 	return 0;
+--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
++++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
+@@ -53,7 +53,7 @@ synproxy_send_tcp(struct net *net,
+ 
+ 	skb_dst_set_noref(nskb, skb_dst(skb));
+ 	nskb->protocol = htons(ETH_P_IP);
+-	if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
++	if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC))
+ 		goto free_nskb;
+ 
+ 	if (nfct) {
+--- a/net/ipv4/netfilter/iptable_mangle.c
++++ b/net/ipv4/netfilter/iptable_mangle.c
+@@ -65,7 +65,7 @@ ipt_mangle_out(struct sk_buff *skb, cons
+ 	    iph->daddr != daddr ||
+ 	    skb->mark != mark ||
+ 	    iph->tos != tos) {
+-		err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
++		err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
+ 		if (err < 0)
+ 			ret = NF_DROP_ERR(err);
+ 	}
+--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
++++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+@@ -397,7 +397,7 @@ nf_nat_ipv4_local_fn(void *priv, struct
+ 
+ 	if (ct->tuplehash[dir].tuple.dst.u3.ip !=
+ 	    ct->tuplehash[!dir].tuple.src.u3.ip) {
+-		err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
++		err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
+ 		if (err < 0)
+ 			ret = NF_DROP_ERR(err);
+ 	}
+--- a/net/ipv4/netfilter/nf_reject_ipv4.c
++++ b/net/ipv4/netfilter/nf_reject_ipv4.c
+@@ -129,7 +129,7 @@ void nf_send_reset(struct net *net, stru
+ 				  ip4_dst_hoplimit(skb_dst(nskb)));
+ 	nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
+ 
+-	if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
++	if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC))
+ 		goto free_nskb;
+ 
+ 	niph = ip_hdr(nskb);
+--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
++++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
+@@ -50,7 +50,7 @@ static unsigned int nf_route_table_hook(
+ 	    iph->daddr != daddr ||
+ 	    skb->mark != mark ||
+ 	    iph->tos != tos) {
+-		err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
++		err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
+ 		if (err < 0)
+ 			ret = NF_DROP_ERR(err);
+ 	}
+--- a/net/ipv6/netfilter.c
++++ b/net/ipv6/netfilter.c
+@@ -18,10 +18,10 @@
+ #include <net/ip6_checksum.h>
+ #include <net/netfilter/nf_queue.h>
+ 
+-int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
++int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff *skb)
+ {
+ 	const struct ipv6hdr *iph = ipv6_hdr(skb);
+-	struct sock *sk = sk_to_full_sk(skb->sk);
++	struct sock *sk = sk_to_full_sk(sk_partial);
+ 	unsigned int hh_len;
+ 	struct dst_entry *dst;
+ 	int strict = (ipv6_addr_type(&iph->daddr) &
+@@ -82,7 +82,7 @@ static int nf_ip6_reroute(struct sk_buff
+ 		if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) ||
+ 		    !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) ||
+ 		    skb->mark != rt_info->mark)
+-			return ip6_route_me_harder(entry->state.net, skb);
++			return ip6_route_me_harder(entry->state.net, entry->state.sk, skb);
+ 	}
+ 	return 0;
+ }
+--- a/net/ipv6/netfilter/ip6table_mangle.c
++++ b/net/ipv6/netfilter/ip6table_mangle.c
+@@ -60,7 +60,7 @@ ip6t_mangle_out(struct sk_buff *skb, con
+ 	     skb->mark != mark ||
+ 	     ipv6_hdr(skb)->hop_limit != hop_limit ||
+ 	     flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) {
+-		err = ip6_route_me_harder(state->net, skb);
++		err = ip6_route_me_harder(state->net, state->sk, skb);
+ 		if (err < 0)
+ 			ret = NF_DROP_ERR(err);
+ 	}
+--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
++++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+@@ -414,7 +414,7 @@ nf_nat_ipv6_local_fn(void *priv, struct
+ 
+ 	if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3,
+ 			      &ct->tuplehash[!dir].tuple.src.u3)) {
+-		err = ip6_route_me_harder(state->net, skb);
++		err = ip6_route_me_harder(state->net, state->sk, skb);
+ 		if (err < 0)
+ 			ret = NF_DROP_ERR(err);
+ 	}
+--- a/net/ipv6/netfilter/nft_chain_route_ipv6.c
++++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
+@@ -52,7 +52,7 @@ static unsigned int nf_route_table_hook(
+ 	     skb->mark != mark ||
+ 	     ipv6_hdr(skb)->hop_limit != hop_limit ||
+ 	     flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) {
+-		err = ip6_route_me_harder(state->net, skb);
++		err = ip6_route_me_harder(state->net, state->sk, skb);
+ 		if (err < 0)
+ 			ret = NF_DROP_ERR(err);
+ 	}
+--- a/net/netfilter/ipvs/ip_vs_core.c
++++ b/net/netfilter/ipvs/ip_vs_core.c
+@@ -713,12 +713,12 @@ static int ip_vs_route_me_harder(struct
+ 		struct dst_entry *dst = skb_dst(skb);
+ 
+ 		if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) &&
+-		    ip6_route_me_harder(ipvs->net, skb) != 0)
++		    ip6_route_me_harder(ipvs->net, skb->sk, skb) != 0)
+ 			return 1;
+ 	} else
+ #endif
+ 		if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+-		    ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0)
++		    ip_route_me_harder(ipvs->net, skb->sk, skb, RTN_LOCAL) != 0)
+ 			return 1;
+ 
+ 	return 0;
-- 
2.30.2