From: Jiong Wang <jiong.wang@netronome.com>
Date: Fri, 1 Dec 2017 05:33:00 +0000 (-0800)
Subject: nfp: bpf: detect load/store sequences lowered from memory copy
X-Git-Url: http://git.cdn.openwrt.org/?a=commitdiff_plain;h=6bc7103c89bff2b53a159e03b74c8216c79bfef8;p=openwrt%2Fstaging%2Fblogic.git

nfp: bpf: detect load/store sequences lowered from memory copy

This patch add the optimization frontend, but adding a new eBPF IR scan
pass "nfp_bpf_opt_ldst_gather".

The pass will traverse the IR to recognize the load/store pairs sequences
that come from lowering of memory copy builtins.

The gathered memory copy information will be kept in the meta info
structure of the first load instruction in the sequence and will be
consumed by the optimization backend added in the previous patches.

NOTE: a sequence with cross memory access doesn't qualify this
optimization, i.e. if one load in the sequence will load from place that
has been written by previous store. This is because when we turn the
sequence into single CPP operation, we are reading all contents at once
into NFP transfer registers, then write them out as a whole. This is not
identical with what the original load/store sequence is doing.

Detecting cross memory access for two random pointers will be difficult,
fortunately under XDP/eBPF's restrictied runtime environment, the copy
normally happen among map, packet data and stack, they do not overlap with
each other.

And for cases supported by NFP, cross memory access will only happen on
PTR_TO_PACKET. Fortunately for this, there is ID information that we could
do accurate memory alias check.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 1b98ef239605..3419ad495962 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -2352,12 +2352,249 @@ static void nfp_bpf_opt_ld_shift(struct nfp_prog *nfp_prog)
 	}
 }
 
+/* load/store pair that forms memory copy sould look like the following:
+ *
+ *   ld_width R, [addr_src + offset_src]
+ *   st_width [addr_dest + offset_dest], R
+ *
+ * The destination register of load and source register of store should
+ * be the same, load and store should also perform at the same width.
+ * If either of addr_src or addr_dest is stack pointer, we don't do the
+ * CPP optimization as stack is modelled by registers on NFP.
+ */
+static bool
+curr_pair_is_memcpy(struct nfp_insn_meta *ld_meta,
+		    struct nfp_insn_meta *st_meta)
+{
+	struct bpf_insn *ld = &ld_meta->insn;
+	struct bpf_insn *st = &st_meta->insn;
+
+	if (!is_mbpf_load(ld_meta) || !is_mbpf_store(st_meta))
+		return false;
+
+	if (ld_meta->ptr.type != PTR_TO_PACKET)
+		return false;
+
+	if (st_meta->ptr.type != PTR_TO_PACKET)
+		return false;
+
+	if (BPF_SIZE(ld->code) != BPF_SIZE(st->code))
+		return false;
+
+	if (ld->dst_reg != st->src_reg)
+		return false;
+
+	/* There is jump to the store insn in this pair. */
+	if (st_meta->flags & FLAG_INSN_IS_JUMP_DST)
+		return false;
+
+	return true;
+}
+
+/* Currently, we only support chaining load/store pairs if:
+ *
+ *  - Their address base registers are the same.
+ *  - Their address offsets are in the same order.
+ *  - They operate at the same memory width.
+ *  - There is no jump into the middle of them.
+ */
+static bool
+curr_pair_chain_with_previous(struct nfp_insn_meta *ld_meta,
+			      struct nfp_insn_meta *st_meta,
+			      struct bpf_insn *prev_ld,
+			      struct bpf_insn *prev_st)
+{
+	u8 prev_size, curr_size, prev_ld_base, prev_st_base, prev_ld_dst;
+	struct bpf_insn *ld = &ld_meta->insn;
+	struct bpf_insn *st = &st_meta->insn;
+	s16 prev_ld_off, prev_st_off;
+
+	/* This pair is the start pair. */
+	if (!prev_ld)
+		return true;
+
+	prev_size = BPF_LDST_BYTES(prev_ld);
+	curr_size = BPF_LDST_BYTES(ld);
+	prev_ld_base = prev_ld->src_reg;
+	prev_st_base = prev_st->dst_reg;
+	prev_ld_dst = prev_ld->dst_reg;
+	prev_ld_off = prev_ld->off;
+	prev_st_off = prev_st->off;
+
+	if (ld->dst_reg != prev_ld_dst)
+		return false;
+
+	if (ld->src_reg != prev_ld_base || st->dst_reg != prev_st_base)
+		return false;
+
+	if (curr_size != prev_size)
+		return false;
+
+	/* There is jump to the head of this pair. */
+	if (ld_meta->flags & FLAG_INSN_IS_JUMP_DST)
+		return false;
+
+	/* Both in ascending order. */
+	if (prev_ld_off + prev_size == ld->off &&
+	    prev_st_off + prev_size == st->off)
+		return true;
+
+	/* Both in descending order. */
+	if (ld->off + curr_size == prev_ld_off &&
+	    st->off + curr_size == prev_st_off)
+		return true;
+
+	return false;
+}
+
+/* Return TRUE if cross memory access happens. Cross memory access means
+ * store area is overlapping with load area that a later load might load
+ * the value from previous store, for this case we can't treat the sequence
+ * as an memory copy.
+ */
+static bool
+cross_mem_access(struct bpf_insn *ld, struct nfp_insn_meta *head_ld_meta,
+		 struct nfp_insn_meta *head_st_meta)
+{
+	s16 head_ld_off, head_st_off, ld_off;
+
+	/* Different pointer types does not overlap. */
+	if (head_ld_meta->ptr.type != head_st_meta->ptr.type)
+		return false;
+
+	/* load and store are both PTR_TO_PACKET, check ID info.  */
+	if (head_ld_meta->ptr.id != head_st_meta->ptr.id)
+		return true;
+
+	/* Canonicalize the offsets. Turn all of them against the original
+	 * base register.
+	 */
+	head_ld_off = head_ld_meta->insn.off + head_ld_meta->ptr.off;
+	head_st_off = head_st_meta->insn.off + head_st_meta->ptr.off;
+	ld_off = ld->off + head_ld_meta->ptr.off;
+
+	/* Ascending order cross. */
+	if (ld_off > head_ld_off &&
+	    head_ld_off < head_st_off && ld_off >= head_st_off)
+		return true;
+
+	/* Descending order cross. */
+	if (ld_off < head_ld_off &&
+	    head_ld_off > head_st_off && ld_off <= head_st_off)
+		return true;
+
+	return false;
+}
+
+/* This pass try to identify the following instructoin sequences.
+ *
+ *   load R, [regA + offA]
+ *   store [regB + offB], R
+ *   load R, [regA + offA + const_imm_A]
+ *   store [regB + offB + const_imm_A], R
+ *   load R, [regA + offA + 2 * const_imm_A]
+ *   store [regB + offB + 2 * const_imm_A], R
+ *   ...
+ *
+ * Above sequence is typically generated by compiler when lowering
+ * memcpy. NFP prefer using CPP instructions to accelerate it.
+ */
+static void nfp_bpf_opt_ldst_gather(struct nfp_prog *nfp_prog)
+{
+	struct nfp_insn_meta *head_ld_meta = NULL;
+	struct nfp_insn_meta *head_st_meta = NULL;
+	struct nfp_insn_meta *meta1, *meta2;
+	struct bpf_insn *prev_ld = NULL;
+	struct bpf_insn *prev_st = NULL;
+	u8 count = 0;
+
+	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
+		struct bpf_insn *ld = &meta1->insn;
+		struct bpf_insn *st = &meta2->insn;
+
+		/* Reset record status if any of the following if true:
+		 *   - The current insn pair is not load/store.
+		 *   - The load/store pair doesn't chain with previous one.
+		 *   - The chained load/store pair crossed with previous pair.
+		 *   - The chained load/store pair has a total size of memory
+		 *     copy beyond 128 bytes which is the maximum length a
+		 *     single NFP CPP command can transfer.
+		 */
+		if (!curr_pair_is_memcpy(meta1, meta2) ||
+		    !curr_pair_chain_with_previous(meta1, meta2, prev_ld,
+						   prev_st) ||
+		    (head_ld_meta && (cross_mem_access(ld, head_ld_meta,
+						       head_st_meta) ||
+				      head_ld_meta->ldst_gather_len >= 128))) {
+			if (!count)
+				continue;
+
+			if (count > 1) {
+				s16 prev_ld_off = prev_ld->off;
+				s16 prev_st_off = prev_st->off;
+				s16 head_ld_off = head_ld_meta->insn.off;
+
+				if (prev_ld_off < head_ld_off) {
+					head_ld_meta->insn.off = prev_ld_off;
+					head_st_meta->insn.off = prev_st_off;
+					head_ld_meta->ldst_gather_len =
+						-head_ld_meta->ldst_gather_len;
+				}
+
+				head_ld_meta->paired_st = &head_st_meta->insn;
+				head_st_meta->skip = true;
+			} else {
+				head_ld_meta->ldst_gather_len = 0;
+			}
+
+			/* If the chain is ended by an load/store pair then this
+			 * could serve as the new head of the the next chain.
+			 */
+			if (curr_pair_is_memcpy(meta1, meta2)) {
+				head_ld_meta = meta1;
+				head_st_meta = meta2;
+				head_ld_meta->ldst_gather_len =
+					BPF_LDST_BYTES(ld);
+				meta1 = nfp_meta_next(meta1);
+				meta2 = nfp_meta_next(meta2);
+				prev_ld = ld;
+				prev_st = st;
+				count = 1;
+			} else {
+				head_ld_meta = NULL;
+				head_st_meta = NULL;
+				prev_ld = NULL;
+				prev_st = NULL;
+				count = 0;
+			}
+
+			continue;
+		}
+
+		if (!head_ld_meta) {
+			head_ld_meta = meta1;
+			head_st_meta = meta2;
+		} else {
+			meta1->skip = true;
+			meta2->skip = true;
+		}
+
+		head_ld_meta->ldst_gather_len += BPF_LDST_BYTES(ld);
+		meta1 = nfp_meta_next(meta1);
+		meta2 = nfp_meta_next(meta2);
+		prev_ld = ld;
+		prev_st = st;
+		count++;
+	}
+}
+
 static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
 {
 	nfp_bpf_opt_reg_init(nfp_prog);
 
 	nfp_bpf_opt_ld_mask(nfp_prog);
 	nfp_bpf_opt_ld_shift(nfp_prog);
+	nfp_bpf_opt_ldst_gather(nfp_prog);
 
 	return 0;
 }