Commit b8af417e authored by David S. Miller


Daniel Borkmann says:

====================
pull-request: bpf-next 2021-02-16

The following pull-request contains BPF updates for your *net-next* tree.

There's a small merge conflict between 7eeba170 ("tcp: Add receive timestamp
support for receive zerocopy.") from net-next tree and 9cacf81f ("bpf: Remove
extra lock_sock for TCP_ZEROCOPY_RECEIVE") from bpf-next tree. Resolve as follows:

  [...]
                lock_sock(sk);
                err = tcp_zerocopy_receive(sk, &zc, &tss);
                err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
                                                          &zc, &len, err);
                release_sock(sk);
  [...]

We've added 116 non-merge commits during the last 27 day(s) which contain
a total of 156 files changed, 5662 insertions(+), 1489 deletions(-).

The main changes are:

1) Add support for pointers to types with known size among global function
   args to overcome the limit on the max # of allowed args, from Dmitrii Banshchikov.

2) Add bpf_iter for task_vma which can be used to generate information similar
   to /proc/pid/maps, from Song Liu.

3) Enable bpf_{g,s}etsockopt() from all sock_addr related program hooks. Allow
   rewriting bind user ports from BPF side below the ip_unprivileged_port_start
   range, both from Stanislav Fomichev.

4) Prevent recursion on fentry/fexit & sleepable programs and allow map-in-map
   as well as per-cpu maps for the latter, from Alexei Starovoitov.

5) Add selftest script to run BPF CI locally. Also enable BPF ringbuffer
   for sleepable programs, both from KP Singh.

6) Extend verifier to enable variable offset read/write access to the BPF
   program stack, from Andrei Matei.

7) Improve tc & XDP MTU handling and add a new bpf_check_mtu() helper to
   query device MTU from programs, from Jesper Dangaard Brouer.

8) Allow the bpf_get_socket_cookie() helper to also be called from [sleepable]
   BPF tracing programs, from Florent Revest.

9) Extend x86 JIT to pad JMPs with NOPs to help the image converge when
   otherwise too many passes are required, from Gary Lin.

10) Verifier fixes on atomics with BPF_FETCH as well as function-by-function
    verification both related to zero-extension handling, from Ilya Leoshkevich.

11) Better kernel build integration of resolve_btfids tool, from Jiri Olsa.

12) Batch of AF_XDP selftest cleanups and small performance improvement
    for libbpf's xsk map redirect for newer kernels, from Björn Töpel.

13) Follow-up BPF doc and verifier improvements around atomics with
    BPF_FETCH, from Brendan Jackman.

14) Permit zero-sized data sections e.g. if ELF .rodata section contains
    read-only data from local variables, from Yonghong Song.

15) veth driver skb bulk-allocation for ndo_xdp_xmit, from Lorenzo Bianconi.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 9ec5eea5 45159b27
+6 −0
@@ -208,6 +208,12 @@ data structures and compile with kernel internal headers. Both of these
kernel internals are subject to change and can break with newer kernels
such that the program needs to be adapted accordingly.

Q: Are tracepoints part of the stable ABI?
------------------------------------------
A: NO. Tracepoints are tied to internal implementation details, hence they are
subject to change and can break with newer kernels. BPF programs need to be
updated accordingly when this happens.
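
For illustration only (this sketch is not part of the change above, and the
struct layout must match whatever the running kernel reports under
/sys/kernel/debug/tracing/events/sched/sched_switch/format), a tracepoint
program hard-codes the tracepoint's argument layout - exactly the kind of
internal detail that can change::

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    /* Mirrors the sched_switch tracepoint format; offsets are illustrative
     * and break if the tracepoint changes in a newer kernel.
     */
    struct sched_switch_args {
            unsigned long long pad;         /* common_* fields */
            char prev_comm[16];
            int prev_pid;
            int prev_prio;
            long prev_state;
            char next_comm[16];
            int next_pid;
            int next_prio;
    };

    SEC("tracepoint/sched/sched_switch")
    int handle_switch(struct sched_switch_args *ctx)
    {
            bpf_printk("switching to pid %d\n", ctx->next_pid);
            return 0;
    }

    char _license[] SEC("license") = "GPL";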

Q: How much stack space a BPF program uses?
-------------------------------------------
A: Currently all program types are limited to 512 bytes of stack
+7 −4
@@ -501,16 +501,19 @@ All LLVM releases can be found at: http://releases.llvm.org/

Q: Got it, so how do I build LLVM manually anyway?
--------------------------------------------------
-A: You need cmake and gcc-c++ as build requisites for LLVM. Once you have
-that set up, proceed with building the latest LLVM and clang version
+A: We recommend that developers who want the fastest incremental builds
+use the Ninja build system, you can find it in your system's package
+manager, usually the package is ninja or ninja-build.

+You need ninja, cmake and gcc-c++ as build requisites for LLVM. Once you
+have that set up, proceed with building the latest LLVM and clang version
from the git repositories::

     $ git clone https://github.com/llvm/llvm-project.git
-     $ mkdir -p llvm-project/llvm/build/install
+     $ mkdir -p llvm-project/llvm/build
     $ cd llvm-project/llvm/build
     $ cmake .. -G "Ninja" -DLLVM_TARGETS_TO_BUILD="BPF;X86" \
                -DLLVM_ENABLE_PROJECTS="clang"    \
                -DBUILD_SHARED_LIBS=OFF           \
                -DCMAKE_BUILD_TYPE=Release        \
                -DLLVM_BUILD_RUNTIME=OFF
     $ ninja
+15 −13
@@ -1048,12 +1048,12 @@ Unlike classic BPF instruction set, eBPF has generic load/store operations::
Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW.

It also includes atomic operations, which use the immediate field for extra
-encoding.
+encoding::

   .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_W  | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg
   .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg

-The basic atomic operations supported are:
+The basic atomic operations supported are::

    BPF_ADD
    BPF_AND
memory location addressed by ``dst_reg + off`` is atomically modified, with
immediate, then these operations also overwrite ``src_reg`` with the
value that was in memory before it was modified.

-The more special operations are:
+The more special operations are::

    BPF_XCHG

This atomically exchanges ``src_reg`` with the value addressed by ``dst_reg +
-off``.
+off``. ::

    BPF_CMPXCHG

This atomically compares the value addressed by ``dst_reg + off`` with
-``R0``. If they match it is replaced with ``src_reg``, The value that was there
-before is loaded back to ``R0``.
+``R0``. If they match it is replaced with ``src_reg``. In either case, the
+value that was there before is zero-extended and loaded back to ``R0``.

Note that 1 and 2 byte atomic operations are not supported.

-Except ``BPF_ADD`` _without_ ``BPF_FETCH`` (for legacy reasons), all 4 byte
-atomic operations require alu32 mode. Clang enables this mode by default in
-architecture v3 (``-mcpu=v3``). For older versions it can be enabled with
+Clang can generate atomic instructions by default when ``-mcpu=v3`` is
+enabled. If a lower version for ``-mcpu`` is set, the only atomic instruction
+Clang can generate is ``BPF_ADD`` *without* ``BPF_FETCH``. If you need to enable
+the atomics features, while keeping a lower ``-mcpu`` version, you can use
+``-Xclang -target-feature -Xclang +alu32``.
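
As a minimal sketch (not part of this patch; the builtin-to-instruction
mapping in the comments reflects typical Clang behaviour rather than a
guarantee), the atomics above are usually reached from BPF C through the
``__sync_*()`` builtins when compiling with ``clang -O2 -target bpf
-mcpu=v3``::

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_ARRAY);
            __uint(max_entries, 1);
            __type(key, __u32);
            __type(value, __u64);
    } counters SEC(".maps");

    SEC("xdp")
    int count_packets(struct xdp_md *ctx)
    {
            __u32 key = 0;
            __u64 *val = bpf_map_lookup_elem(&counters, &key);
            __u64 old;

            if (!val)
                    return XDP_PASS;

            /* BPF_ADD: lock xadd *(u64 *)val += 1 (BPF_FETCH is used when
             * the return value is consumed)
             */
            __sync_fetch_and_add(val, 1);
            /* BPF_XCHG: atomically store 0 and fetch the previous value */
            old = __sync_lock_test_and_set(val, 0);
            /* BPF_CMPXCHG: put the old value back if the slot is still 0 */
            __sync_val_compare_and_swap(val, 0, old);
            return XDP_PASS;
    }

    char _license[] SEC("license") = "GPL";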

-You may encounter BPF_XADD - this is a legacy name for BPF_ATOMIC, referring to
-the exclusive-add operation encoded when the immediate field is zero.
+You may encounter ``BPF_XADD`` - this is a legacy name for ``BPF_ATOMIC``,
+referring to the exclusive-add operation encoded when the immediate field is
+zero.

-eBPF has one 16-byte instruction: BPF_LD | BPF_DW | BPF_IMM which consists
+eBPF has one 16-byte instruction: ``BPF_LD | BPF_DW | BPF_IMM`` which consists
of two consecutive ``struct bpf_insn`` 8-byte blocks and interpreted as single
instruction that loads 64-bit immediate value into a dst_reg.
-Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads
+Classic BPF has similar instruction: ``BPF_LD | BPF_W | BPF_IMM`` which loads
32-bit immediate value into a register.
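
As an illustration (not part of this change), the 16-byte form can be built
by hand as two consecutive ``struct bpf_insn`` entries, where only the
``imm`` field of the second entry is used and carries the upper 32 bits of
the constant::

    #include <linux/bpf.h>   /* struct bpf_insn, BPF_LD, BPF_DW, BPF_IMM */

    #define IMM64 0x11223344aabbccddULL

    /* r1 = 0x11223344aabbccdd encoded as BPF_LD | BPF_DW | BPF_IMM */
    static const struct bpf_insn ld_imm64[2] = {
            { .code = BPF_LD | BPF_DW | BPF_IMM, .dst_reg = BPF_REG_1,
              .imm = (__s32)(__u32)IMM64 },            /* lower 32 bits */
            { .code = 0,                               /* pseudo second half */
              .imm = (__s32)(IMM64 >> 32) },           /* upper 32 bits */
    };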

eBPF verifier
+12 −1
@@ -1082,6 +1082,17 @@ ifdef CONFIG_STACK_VALIDATION
  endif
endif

PHONY += resolve_btfids_clean

resolve_btfids_O = $(abspath $(objtree))/tools/bpf/resolve_btfids

# tools/bpf/resolve_btfids directory might not exist
# in output directory, skip its clean in that case
resolve_btfids_clean:
ifneq ($(wildcard $(resolve_btfids_O)),)
	$(Q)$(MAKE) -sC $(srctree)/tools/bpf/resolve_btfids O=$(resolve_btfids_O) clean
endif

ifdef CONFIG_BPF
ifdef CONFIG_DEBUG_INFO_BTF
  ifeq ($(has_libelf),1)
@@ -1491,7 +1502,7 @@ vmlinuxclean:
	$(Q)$(CONFIG_SHELL) $(srctree)/scripts/link-vmlinux.sh clean
	$(Q)$(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) clean)

-clean: archclean vmlinuxclean
+clean: archclean vmlinuxclean resolve_btfids_clean

# mrproper - Delete all generated files, including .config
#
+158 −47
@@ -869,8 +869,31 @@ static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt,
	}
}

static int emit_nops(u8 **pprog, int len)
{
	u8 *prog = *pprog;
	int i, noplen, cnt = 0;

	while (len > 0) {
		noplen = len;

		if (noplen > ASM_NOP_MAX)
			noplen = ASM_NOP_MAX;

		for (i = 0; i < noplen; i++)
			EMIT1(ideal_nops[noplen][i]);
		len -= noplen;
	}

	*pprog = prog;

	return cnt;
}

#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
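/*
 * Size delta for BPF insn i: addrs[i] - addrs[i - 1] is the size this insn
 * had in the previous JIT pass, while prog - temp is what has been emitted
 * for it so far in the current pass, so the difference tells how many bytes
 * the instruction is about to shrink by and hence how much NOP padding is
 * needed (when jmp_padding is enabled) to keep the image size stable across
 * passes.
 */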

static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
-		  int oldproglen, struct jit_context *ctx)
+		  int oldproglen, struct jit_context *ctx, bool jmp_padding)
{
	bool tail_call_reachable = bpf_prog->aux->tail_call_reachable;
	struct bpf_insn *insn = bpf_prog->insnsi;
@@ -880,7 +903,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
	bool seen_exit = false;
	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
	int i, cnt = 0, excnt = 0;
-	int proglen = 0;
+	int ilen, proglen = 0;
	u8 *prog = temp;
	int err;

@@ -894,17 +917,24 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
		      bpf_prog_was_classic(bpf_prog), tail_call_reachable,
		      bpf_prog->aux->func_idx != 0);
	push_callee_regs(&prog, callee_regs_used);
-	addrs[0] = prog - temp;

+	ilen = prog - temp;
+	if (image)
+		memcpy(image + proglen, temp, ilen);
+	proglen += ilen;
+	addrs[0] = proglen;
+	prog = temp;

	for (i = 1; i <= insn_cnt; i++, insn++) {
		const s32 imm32 = insn->imm;
		u32 dst_reg = insn->dst_reg;
		u32 src_reg = insn->src_reg;
		u8 b2 = 0, b3 = 0;
		u8 *start_of_ldx;
		s64 jmp_offset;
		u8 jmp_cond;
		int ilen;
		u8 *func;
		int nops;

		switch (insn->code) {
			/* ALU */
@@ -1249,12 +1279,30 @@ st: if (is_imm8(insn->off))
		case BPF_LDX | BPF_PROBE_MEM | BPF_W:
		case BPF_LDX | BPF_MEM | BPF_DW:
		case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
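			/* For PROBE_MEM loads, a NULL src_reg is handled
			 * inline: test it and, when it is zero, skip the
			 * load and clear dst_reg instead of relying on the
			 * exception fixup path.
			 */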
			if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
				/* test src_reg, src_reg */
				maybe_emit_mod(&prog, src_reg, src_reg, true); /* always 1 byte */
				EMIT2(0x85, add_2reg(0xC0, src_reg, src_reg));
				/* jne start_of_ldx */
				EMIT2(X86_JNE, 0);
				/* xor dst_reg, dst_reg */
				emit_mov_imm32(&prog, false, dst_reg, 0);
				/* jmp byte_after_ldx */
				EMIT2(0xEB, 0);

				/* populate jmp_offset for JNE above */
				temp[4] = prog - temp - 5 /* sizeof(test + jne) */;
				start_of_ldx = prog;
			}
			emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
			if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
				struct exception_table_entry *ex;
				u8 *_insn = image + proglen;
				s64 delta;

				/* populate jmp_offset for JMP above */
				start_of_ldx[-1] = prog - start_of_ldx;

				if (!bpf_prog->aux->extable)
					break;

@@ -1502,6 +1550,30 @@ st: if (is_imm8(insn->off))
			}
			jmp_offset = addrs[i + insn->off] - addrs[i];
			if (is_imm8(jmp_offset)) {
				if (jmp_padding) {
					/* To keep the jmp_offset valid, the extra bytes are
					 * padded before the jump insn, so we subtract the
					 * 2 bytes of jmp_cond insn from INSN_SZ_DIFF.
					 *
					 * If the previous pass already emits an imm8
					 * jmp_cond, then this BPF insn won't shrink, so
					 * "nops" is 0.
					 *
					 * On the other hand, if the previous pass emits an
					 * imm32 jmp_cond, the extra 4 bytes(*) are padded to
					 * keep the image from shrinking further.
					 *
					 * (*) imm32 jmp_cond is 6 bytes, and imm8 jmp_cond
					 *     is 2 bytes, so the size difference is 4 bytes.
					 */
					nops = INSN_SZ_DIFF - 2;
					if (nops != 0 && nops != 4) {
						pr_err("unexpected jmp_cond padding: %d bytes\n",
						       nops);
						return -EFAULT;
					}
					cnt += emit_nops(&prog, nops);
				}
				EMIT2(jmp_cond, jmp_offset);
			} else if (is_simm32(jmp_offset)) {
				EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
@@ -1524,11 +1596,55 @@ st: if (is_imm8(insn->off))
			else
				jmp_offset = addrs[i + insn->off] - addrs[i];

-			if (!jmp_offset)
-				/* Optimize out nop jumps */
+			if (!jmp_offset) {
				/*
				 * If jmp_padding is enabled, the extra nops will
				 * be inserted. Otherwise, optimize out nop jumps.
				 */
				if (jmp_padding) {
					/* There are 3 possible conditions.
					 * (1) This BPF_JA is already optimized out in
					 *     the previous run, so there is no need
					 *     to pad any extra byte (0 byte).
					 * (2) The previous pass emits an imm8 jmp,
					 *     so we pad 2 bytes to match the previous
					 *     insn size.
					 * (3) Similarly, the previous pass emits an
					 *     imm32 jmp, and 5 bytes is padded.
					 */
					nops = INSN_SZ_DIFF;
					if (nops != 0 && nops != 2 && nops != 5) {
						pr_err("unexpected nop jump padding: %d bytes\n",
						       nops);
						return -EFAULT;
					}
					cnt += emit_nops(&prog, nops);
				}
				break;
			}
emit_jmp:
			if (is_imm8(jmp_offset)) {
				if (jmp_padding) {
					/* To avoid breaking jmp_offset, the extra bytes
					 * are padded before the actual jmp insn, so
					 * 2 bytes is subtracted from INSN_SZ_DIFF.
					 *
					 * If the previous pass already emits an imm8
					 * jmp, there is nothing to pad (0 byte).
					 *
					 * If it emits an imm32 jmp (5 bytes) previously
					 * and now an imm8 jmp (2 bytes), then we pad
					 * (5 - 2 = 3) bytes to stop the image from
					 * shrinking further.
					 */
					nops = INSN_SZ_DIFF - 2;
					if (nops != 0 && nops != 3) {
						pr_err("unexpected jump padding: %d bytes\n",
						       nops);
						return -EFAULT;
					}
					cnt += emit_nops(&prog, INSN_SZ_DIFF - 2);
				}
				EMIT2(0xEB, jmp_offset);
			} else if (is_simm32(jmp_offset)) {
				EMIT1_off32(0xE9, jmp_offset);
@@ -1624,17 +1740,25 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
			   struct bpf_prog *p, int stack_size, bool mod_ret)
{
	u8 *prog = *pprog;
	u8 *jmp_insn;
	int cnt = 0;

-	if (p->aux->sleepable) {
-		if (emit_call(&prog, __bpf_prog_enter_sleepable, prog))
-			return -EINVAL;
-	} else {
-		if (emit_call(&prog, __bpf_prog_enter, prog))
+	/* arg1: mov rdi, progs[i] */
+	emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
+	if (emit_call(&prog,
+		      p->aux->sleepable ? __bpf_prog_enter_sleepable :
+		      __bpf_prog_enter, prog))
			return -EINVAL;
	/* remember prog start time returned by __bpf_prog_enter */
	emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
-	}

	/* if (__bpf_prog_enter*(prog) == 0)
	 *	goto skip_exec_of_prog;
	 */
	EMIT3(0x48, 0x85, 0xC0);  /* test rax,rax */
	/* emit 2 nops that will be replaced with JE insn */
	jmp_insn = prog;
	emit_nops(&prog, 2);
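	/* __bpf_prog_enter*() returns the start time used for stats, or 0 when
	 * the program must not run (e.g. the recursion guard fired); the two
	 * NOPs above are later patched into a JE that then skips the program
	 * invocation below.
	 */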

	/* arg1: lea rdi, [rbp - stack_size] */
	EMIT4(0x48, 0x8D, 0x7D, -stack_size);
@@ -1654,43 +1778,23 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
	if (mod_ret)
		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);

-	if (p->aux->sleepable) {
-		if (emit_call(&prog, __bpf_prog_exit_sleepable, prog))
-			return -EINVAL;
-	} else {
+	/* replace 2 nops with JE insn, since jmp target is known */
+	jmp_insn[0] = X86_JE;
+	jmp_insn[1] = prog - jmp_insn - 2;

	/* arg1: mov rdi, progs[i] */
-		emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32,
-			       (u32) (long) p);
+	emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
	/* arg2: mov rsi, rbx <- start time in nsec */
	emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
-		if (emit_call(&prog, __bpf_prog_exit, prog))
+	if (emit_call(&prog,
+		      p->aux->sleepable ? __bpf_prog_exit_sleepable :
+		      __bpf_prog_exit, prog))
			return -EINVAL;
-	}

	*pprog = prog;
	return 0;
}

-static void emit_nops(u8 **pprog, unsigned int len)
-{
-	unsigned int i, noplen;
-	u8 *prog = *pprog;
-	int cnt = 0;
-
-	while (len > 0) {
-		noplen = len;
-
-		if (noplen > ASM_NOP_MAX)
-			noplen = ASM_NOP_MAX;
-
-		for (i = 0; i < noplen; i++)
-			EMIT1(ideal_nops[noplen][i]);
-		len -= noplen;
-	}
-
-	*pprog = prog;
-}

static void emit_align(u8 **pprog, u32 align)
{
	u8 *target, *prog = *pprog;
@@ -2065,6 +2169,9 @@ struct x64_jit_data {
	struct jit_context ctx;
};

#define MAX_PASSES 20
#define PADDING_PASSES (MAX_PASSES - 5)
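/*
 * The first PADDING_PASSES iterations run without jmp_padding so the image
 * can shrink freely; if it still has not converged by then, the remaining
 * passes pad shrinking jumps with NOPs so the size can finally settle.
 */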

struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{
	struct bpf_binary_header *header = NULL;
@@ -2074,6 +2181,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
	struct jit_context ctx = {};
	bool tmp_blinded = false;
	bool extra_pass = false;
	bool padding = false;
	u8 *image = NULL;
	int *addrs;
	int pass;
@@ -2110,6 +2218,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
		image = jit_data->image;
		header = jit_data->header;
		extra_pass = true;
		padding = true;
		goto skip_init_addrs;
	}
	addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL);
@@ -2135,8 +2244,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
	 * may converge on the last pass. In such case do one more
	 * pass to emit the final image.
	 */
-	for (pass = 0; pass < 20 || image; pass++) {
-		proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
+	for (pass = 0; pass < MAX_PASSES || image; pass++) {
+		if (!padding && pass >= PADDING_PASSES)
+			padding = true;
+		proglen = do_jit(prog, addrs, image, oldproglen, &ctx, padding);
		if (proglen <= 0) {
out_image:
			image = NULL;