bpf: Add struct bpf_tcp_sock and BPF_FUNC_tcp_sock
author Martin KaFai Lau <kafai@fb.com>
Sun, 10 Feb 2019 07:22:24 +0000 (23:22 -0800)
committer Alexei Starovoitov <ast@kernel.org>
Mon, 11 Feb 2019 03:46:17 +0000 (19:46 -0800)
This patch adds a helper function BPF_FUNC_tcp_sock and it
is currently available for cg_skb and sched_(cls|act):

struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk);

int cg_skb_foo(struct __sk_buff *skb) {
struct bpf_tcp_sock *tp;
struct bpf_sock *sk;
__u32 snd_cwnd;

sk = skb->sk;
if (!sk)
return 1;

tp = bpf_tcp_sock(sk);
if (!tp)
return 1;

snd_cwnd = tp->snd_cwnd;
/* ... */

return 1;
}

A 'struct bpf_tcp_sock' is also added to the uapi bpf.h to provide
read-only access.  bpf_tcp_sock has all the existing tcp_sock's fields
that have already been exposed by the bpf_sock_ops.
i.e. no new tcp_sock's fields are exposed in bpf.h.

This helper returns a pointer to the tcp_sock.  If it is not a tcp_sock
or it cannot be traced back to a tcp_sock by sk_to_full_sk(), it
returns NULL.  Hence, the caller needs to check for NULL before
accessing it.

The current use case is to expose members from tcp_sock
to allow a cg_skb_bpf_prog to provide per-cgroup traffic
policing/shaping.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
include/linux/bpf.h
include/uapi/linux/bpf.h
kernel/bpf/verifier.c
net/core/filter.c

index a60463b45b542e35f155187ec4a1b4181d87687e..7f58828755fdad9d170b5c2ebee4f55a6638f2a9 100644 (file)
@@ -204,6 +204,7 @@ enum bpf_return_type {
        RET_PTR_TO_MAP_VALUE,           /* returns a pointer to map elem value */
        RET_PTR_TO_MAP_VALUE_OR_NULL,   /* returns a pointer to map elem value or NULL */
        RET_PTR_TO_SOCKET_OR_NULL,      /* returns a pointer to a socket or NULL */
+       RET_PTR_TO_TCP_SOCK_OR_NULL,    /* returns a pointer to a tcp_sock or NULL */
 };
 
 /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
@@ -259,6 +260,8 @@ enum bpf_reg_type {
        PTR_TO_SOCKET_OR_NULL,   /* reg points to struct bpf_sock or NULL */
        PTR_TO_SOCK_COMMON,      /* reg points to sock_common */
        PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */
+       PTR_TO_TCP_SOCK,         /* reg points to struct tcp_sock */
+       PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -956,4 +959,31 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
 }
 #endif
 
+#ifdef CONFIG_INET
+bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+                                 struct bpf_insn_access_aux *info);
+
+u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
+                                   const struct bpf_insn *si,
+                                   struct bpf_insn *insn_buf,
+                                   struct bpf_prog *prog,
+                                   u32 *target_size);
+#else
+static inline bool bpf_tcp_sock_is_valid_access(int off, int size,
+                                               enum bpf_access_type type,
+                                               struct bpf_insn_access_aux *info)
+{
+       return false;
+}
+
+static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
+                                                 const struct bpf_insn *si,
+                                                 struct bpf_insn *insn_buf,
+                                                 struct bpf_prog *prog,
+                                                 u32 *target_size)
+{
+       return 0;
+}
+#endif /* CONFIG_INET */
+
 #endif /* _LINUX_BPF_H */
index d8f91777c5b6de83fa0f1ba558b49027c81db4a3..25c8c0e62ecf69dc59b21ed245404e5bc49b1ad3 100644 (file)
@@ -2337,6 +2337,15 @@ union bpf_attr {
  *     Return
  *             A **struct bpf_sock** pointer on success, or NULL in
  *             case of failure.
+ *
+ * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk)
+ *     Description
+ *             This helper gets a **struct bpf_tcp_sock** pointer from a
+ *             **struct bpf_sock** pointer.
+ *
+ *     Return
+ *             A **struct bpf_tcp_sock** pointer on success, or NULL in
+ *             case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)          \
        FN(unspec),                     \
@@ -2434,7 +2443,8 @@ union bpf_attr {
        FN(rc_pointer_rel),             \
        FN(spin_lock),                  \
        FN(spin_unlock),                \
-       FN(sk_fullsock),
+       FN(sk_fullsock),                \
+       FN(tcp_sock),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2616,6 +2626,45 @@ struct bpf_sock {
        __u32 state;
 };
 
+struct bpf_tcp_sock {
+       __u32 snd_cwnd;         /* Sending congestion window            */
+       __u32 srtt_us;          /* smoothed round trip time << 3 in usecs */
+       __u32 rtt_min;
+       __u32 snd_ssthresh;     /* Slow start size threshold            */
+       __u32 rcv_nxt;          /* What we want to receive next         */
+       __u32 snd_nxt;          /* Next sequence we send                */
+       __u32 snd_una;          /* First byte we want an ack for        */
+       __u32 mss_cache;        /* Cached effective mss, not including SACKS */
+       __u32 ecn_flags;        /* ECN status bits.                     */
+       __u32 rate_delivered;   /* saved rate sample: packets delivered */
+       __u32 rate_interval_us; /* saved rate sample: time elapsed */
+       __u32 packets_out;      /* Packets which are "in flight"        */
+       __u32 retrans_out;      /* Retransmitted packets out            */
+       __u32 total_retrans;    /* Total retransmits for entire connection */
+       __u32 segs_in;          /* RFC4898 tcpEStatsPerfSegsIn
+                                * total number of segments in.
+                                */
+       __u32 data_segs_in;     /* RFC4898 tcpEStatsPerfDataSegsIn
+                                * total number of data segments in.
+                                */
+       __u32 segs_out;         /* RFC4898 tcpEStatsPerfSegsOut
+                                * The total number of segments sent.
+                                */
+       __u32 data_segs_out;    /* RFC4898 tcpEStatsPerfDataSegsOut
+                                * total number of data segments sent.
+                                */
+       __u32 lost_out;         /* Lost packets                 */
+       __u32 sacked_out;       /* SACK'd packets                       */
+       __u64 bytes_received;   /* RFC4898 tcpEStatsAppHCThruOctetsReceived
+                                * sum(delta(rcv_nxt)), or how many bytes
+                                * were acked.
+                                */
+       __u64 bytes_acked;      /* RFC4898 tcpEStatsAppHCThruOctetsAcked
+                                * sum(delta(snd_una)), or how many bytes
+                                * were acked.
+                                */
+};
+
 struct bpf_sock_tuple {
        union {
                struct {
index b755d55a379128e2bb857f3c699dd7bad0e04a5e..1b9496c413833ad1f88c98eeee387f349cf241e0 100644 (file)
@@ -334,14 +334,16 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type)
 static bool type_is_sk_pointer(enum bpf_reg_type type)
 {
        return type == PTR_TO_SOCKET ||
-               type == PTR_TO_SOCK_COMMON;
+               type == PTR_TO_SOCK_COMMON ||
+               type == PTR_TO_TCP_SOCK;
 }
 
 static bool reg_type_may_be_null(enum bpf_reg_type type)
 {
        return type == PTR_TO_MAP_VALUE_OR_NULL ||
               type == PTR_TO_SOCKET_OR_NULL ||
-              type == PTR_TO_SOCK_COMMON_OR_NULL;
+              type == PTR_TO_SOCK_COMMON_OR_NULL ||
+              type == PTR_TO_TCP_SOCK_OR_NULL;
 }
 
 static bool type_is_refcounted(enum bpf_reg_type type)
@@ -407,6 +409,8 @@ static const char * const reg_type_str[] = {
        [PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
        [PTR_TO_SOCK_COMMON]    = "sock_common",
        [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
+       [PTR_TO_TCP_SOCK]       = "tcp_sock",
+       [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
 };
 
 static char slot_type_char[] = {
@@ -1209,6 +1213,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
        case PTR_TO_SOCKET_OR_NULL:
        case PTR_TO_SOCK_COMMON:
        case PTR_TO_SOCK_COMMON_OR_NULL:
+       case PTR_TO_TCP_SOCK:
+       case PTR_TO_TCP_SOCK_OR_NULL:
                return true;
        default:
                return false;
@@ -1662,6 +1668,9 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
        case PTR_TO_SOCKET:
                valid = bpf_sock_is_valid_access(off, size, t, &info);
                break;
+       case PTR_TO_TCP_SOCK:
+               valid = bpf_tcp_sock_is_valid_access(off, size, t, &info);
+               break;
        default:
                valid = false;
        }
@@ -1823,6 +1832,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
        case PTR_TO_SOCK_COMMON:
                pointer_desc = "sock_common ";
                break;
+       case PTR_TO_TCP_SOCK:
+               pointer_desc = "tcp_sock ";
+               break;
        default:
                break;
        }
@@ -3148,6 +3160,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
                        /* For mark_ptr_or_null_reg() */
                        regs[BPF_REG_0].id = ++env->id_gen;
                }
+       } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
+               mark_reg_known_zero(env, regs, BPF_REG_0);
+               regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
+               regs[BPF_REG_0].id = ++env->id_gen;
        } else {
                verbose(env, "unknown return type %d of func %s#%d\n",
                        fn->ret_type, func_id_name(func_id), func_id);
@@ -3409,6 +3425,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
        case PTR_TO_SOCKET_OR_NULL:
        case PTR_TO_SOCK_COMMON:
        case PTR_TO_SOCK_COMMON_OR_NULL:
+       case PTR_TO_TCP_SOCK:
+       case PTR_TO_TCP_SOCK_OR_NULL:
                verbose(env, "R%d pointer arithmetic on %s prohibited\n",
                        dst, reg_type_str[ptr_reg->type]);
                return -EACCES;
@@ -4644,6 +4662,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
                        reg->type = PTR_TO_SOCKET;
                } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) {
                        reg->type = PTR_TO_SOCK_COMMON;
+               } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
+                       reg->type = PTR_TO_TCP_SOCK;
                }
                if (is_null || !(reg_is_refcounted(reg) ||
                                 reg_may_point_to_spin_lock(reg))) {
@@ -5839,6 +5859,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
        case PTR_TO_SOCKET_OR_NULL:
        case PTR_TO_SOCK_COMMON:
        case PTR_TO_SOCK_COMMON_OR_NULL:
+       case PTR_TO_TCP_SOCK:
+       case PTR_TO_TCP_SOCK_OR_NULL:
                /* Only valid matches are exact, which memcmp() above
                 * would have accepted
                 */
@@ -6161,6 +6183,8 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
        case PTR_TO_SOCKET_OR_NULL:
        case PTR_TO_SOCK_COMMON:
        case PTR_TO_SOCK_COMMON_OR_NULL:
+       case PTR_TO_TCP_SOCK:
+       case PTR_TO_TCP_SOCK_OR_NULL:
                return false;
        default:
                return true;
@@ -7166,6 +7190,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
                case PTR_TO_SOCK_COMMON:
                        convert_ctx_access = bpf_sock_convert_ctx_access;
                        break;
+               case PTR_TO_TCP_SOCK:
+                       convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
+                       break;
                default:
                        continue;
                }
index c0d7b9ef279fd62989aac9466ae5e234ee55d6ec..353735575204e9f17decd14e27fea916d047d39d 100644 (file)
@@ -5315,6 +5315,79 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
        .arg5_type      = ARG_ANYTHING,
 };
 
+bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+                                 struct bpf_insn_access_aux *info)
+{
+       if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked))
+               return false;
+
+       if (off % size != 0)
+               return false;
+
+       switch (off) {
+       case offsetof(struct bpf_tcp_sock, bytes_received):
+       case offsetof(struct bpf_tcp_sock, bytes_acked):
+               return size == sizeof(__u64);
+       default:
+               return size == sizeof(__u32);
+       }
+}
+
+u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
+                                   const struct bpf_insn *si,
+                                   struct bpf_insn *insn_buf,
+                                   struct bpf_prog *prog, u32 *target_size)
+{
+       struct bpf_insn *insn = insn_buf;
+
+#define BPF_TCP_SOCK_GET_COMMON(FIELD)                                 \
+       do {                                                            \
+               BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD) >     \
+                            FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \
+               *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\
+                                     si->dst_reg, si->src_reg,         \
+                                     offsetof(struct tcp_sock, FIELD)); \
+       } while (0)
+
+       CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock,
+                                      BPF_TCP_SOCK_GET_COMMON);
+
+       if (insn > insn_buf)
+               return insn - insn_buf;
+
+       switch (si->off) {
+       case offsetof(struct bpf_tcp_sock, rtt_min):
+               BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) !=
+                            sizeof(struct minmax));
+               BUILD_BUG_ON(sizeof(struct minmax) <
+                            sizeof(struct minmax_sample));
+
+               *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+                                     offsetof(struct tcp_sock, rtt_min) +
+                                     offsetof(struct minmax_sample, v));
+               break;
+       }
+
+       return insn - insn_buf;
+}
+
+BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
+{
+       sk = sk_to_full_sk(sk);
+
+       if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
+               return (unsigned long)sk;
+
+       return (unsigned long)NULL;
+}
+
+static const struct bpf_func_proto bpf_tcp_sock_proto = {
+       .func           = bpf_tcp_sock,
+       .gpl_only       = false,
+       .ret_type       = RET_PTR_TO_TCP_SOCK_OR_NULL,
+       .arg1_type      = ARG_PTR_TO_SOCK_COMMON,
+};
+
 #endif /* CONFIG_INET */
 
 bool bpf_helper_changes_pkt_data(void *func)
@@ -5470,6 +5543,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return &bpf_get_local_storage_proto;
        case BPF_FUNC_sk_fullsock:
                return &bpf_sk_fullsock_proto;
+#ifdef CONFIG_INET
+       case BPF_FUNC_tcp_sock:
+               return &bpf_tcp_sock_proto;
+#endif
        default:
                return sk_filter_func_proto(func_id, prog);
        }
@@ -5560,6 +5637,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return &bpf_sk_lookup_udp_proto;
        case BPF_FUNC_sk_release:
                return &bpf_sk_release_proto;
+       case BPF_FUNC_tcp_sock:
+               return &bpf_tcp_sock_proto;
 #endif
        default:
                return bpf_base_func_proto(func_id);