netfilter: nf_conntrack: add IPS_OFFLOAD status bit
author Pablo Neira Ayuso <pablo@netfilter.org>
Sun, 7 Jan 2018 00:03:56 +0000 (01:03 +0100)
committer Pablo Neira Ayuso <pablo@netfilter.org>
Mon, 8 Jan 2018 17:11:05 +0000 (18:11 +0100)
This new bit tells us that the conntrack entry is owned by the flow
table offload infrastructure.

 # cat /proc/net/nf_conntrack
 ipv4     2 tcp      6 src=10.141.10.2 dst=147.75.205.195 sport=36392 dport=443 src=147.75.205.195 dst=192.168.2.195 sport=443 dport=36392 [OFFLOAD] mark=0 zone=0 use=2

Note the [OFFLOAD] tag in the listing.

The timer of such conntrack entries looks as if it were stopped when
seen from userspace. In practice, to make sure the conntrack entry does
not go away, the conntrack timer is periodically set to an arbitrary
large value that gets refreshed on every iteration of the garbage
collector, so it never expires. Such entries also display no internal
state in the case of TCP flows. Refreshing the timer this way allows us
to save a bit check in the packet path via nf_ct_is_expired().

Conntrack entries that have been offloaded to the flow table
infrastructure cannot be deleted/flushed via ctnetlink. The flow table
infrastructure is also responsible for releasing this conntrack entry.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
include/uapi/linux/netfilter/nf_conntrack_common.h
net/netfilter/nf_conntrack_core.c
net/netfilter/nf_conntrack_netlink.c
net/netfilter/nf_conntrack_proto_tcp.c
net/netfilter/nf_conntrack_standalone.c

index 3fea7709a4412c5e3c7f94daed08576b55bf3bc2..fc8c15a24a433c77e4a65c8a49536fd2f0129f90 100644 (file)
@@ -101,12 +101,16 @@ enum ip_conntrack_status {
        IPS_HELPER_BIT = 13,
        IPS_HELPER = (1 << IPS_HELPER_BIT),
 
+       /* Conntrack has been offloaded to flow table. */
+       IPS_OFFLOAD_BIT = 14,
+       IPS_OFFLOAD = (1 << IPS_OFFLOAD_BIT),
+
        /* Be careful here, modifying these bits can make things messy,
         * so don't let users modify them directly.
         */
        IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK |
                                 IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING |
-                                IPS_SEQ_ADJUST | IPS_TEMPLATE),
+                                IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_OFFLOAD),
 
        __IPS_MAX_BIT = 14,
 };
index 85f643c1e227c5a70c3b3038baad981e88dca880..6a64d528d0765ab5646211344350e23ea08511e5 100644 (file)
@@ -901,6 +901,9 @@ static unsigned int early_drop_list(struct net *net,
        hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
                tmp = nf_ct_tuplehash_to_ctrack(h);
 
+               if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
+                       continue;
+
                if (nf_ct_is_expired(tmp)) {
                        nf_ct_gc_expired(tmp);
                        continue;
@@ -975,6 +978,18 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
        return false;
 }
 
+#define        DAY     (86400 * HZ)
+
+/* Set an arbitrary timeout large enough not to ever expire; this saves
+ * us a check for the IPS_OFFLOAD_BIT from the packet path via
+ * nf_ct_is_expired().
+ */
+static void nf_ct_offload_timeout(struct nf_conn *ct)
+{
+       if (nf_ct_expires(ct) < DAY / 2)
+               ct->timeout = nfct_time_stamp + DAY;
+}
+
 static void gc_worker(struct work_struct *work)
 {
        unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
@@ -1011,6 +1026,11 @@ static void gc_worker(struct work_struct *work)
                        tmp = nf_ct_tuplehash_to_ctrack(h);
 
                        scanned++;
+                       if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
+                               nf_ct_offload_timeout(tmp);
+                               continue;
+                       }
+
                        if (nf_ct_is_expired(tmp)) {
                                nf_ct_gc_expired(tmp);
                                expired_count++;
index 316bbdc4a158bbba065e340c587afb3e645e2ef9..7c7921a53b13c1540f0d78262ba9118b21d6d9b9 100644 (file)
@@ -1110,6 +1110,14 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
                                    .len = NF_CT_LABELS_MAX_SIZE },
 };
 
+static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
+{
+       if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
+               return 0;
+
+       return ctnetlink_filter_match(ct, data);
+}
+
 static int ctnetlink_flush_conntrack(struct net *net,
                                     const struct nlattr * const cda[],
                                     u32 portid, int report)
@@ -1122,7 +1130,7 @@ static int ctnetlink_flush_conntrack(struct net *net,
                        return PTR_ERR(filter);
        }
 
-       nf_ct_iterate_cleanup_net(net, ctnetlink_filter_match, filter,
+       nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter,
                                  portid, report);
        kfree(filter);
 
@@ -1168,6 +1176,11 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
 
        ct = nf_ct_tuplehash_to_ctrack(h);
 
+       if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
+               nf_ct_put(ct);
+               return -EBUSY;
+       }
+
        if (cda[CTA_ID]) {
                u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID]));
                if (id != (u32)(unsigned long)ct) {
index 684cc29010a0c7714ce5056bc184ea9c2596a9b5..e97cdc1cf98c2618dffe838c4c7ef395e03ed8e0 100644 (file)
@@ -305,6 +305,9 @@ static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple,
 /* Print out the private part of the conntrack. */
 static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
 {
+       if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
+               return;
+
        seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]);
 }
 #endif
index 5a101caa3e1279058970882ab1294ef5a7434f8c..46d32baad095f3c444212d9f9db596aa6edee629 100644 (file)
@@ -309,10 +309,12 @@ static int ct_seq_show(struct seq_file *s, void *v)
        WARN_ON(!l4proto);
 
        ret = -ENOSPC;
-       seq_printf(s, "%-8s %u %-8s %u %ld ",
+       seq_printf(s, "%-8s %u %-8s %u ",
                   l3proto_name(l3proto->l3proto), nf_ct_l3num(ct),
-                  l4proto_name(l4proto->l4proto), nf_ct_protonum(ct),
-                  nf_ct_expires(ct)  / HZ);
+                  l4proto_name(l4proto->l4proto), nf_ct_protonum(ct));
+
+       if (!test_bit(IPS_OFFLOAD_BIT, &ct->status))
+               seq_printf(s, "%ld ", nf_ct_expires(ct)  / HZ);
 
        if (l4proto->print_conntrack)
                l4proto->print_conntrack(s, ct);
@@ -339,7 +341,9 @@ static int ct_seq_show(struct seq_file *s, void *v)
        if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
                goto release;
 
-       if (test_bit(IPS_ASSURED_BIT, &ct->status))
+       if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
+               seq_puts(s, "[OFFLOAD] ");
+       else if (test_bit(IPS_ASSURED_BIT, &ct->status))
                seq_puts(s, "[ASSURED] ");
 
        if (seq_has_overflowed(s))