Commit 80123f0a authored by Alexei Starovoitov's avatar Alexei Starovoitov
Browse files

Merge branch 'bpf_prog_pack allocator'



Song Liu says:

====================

Changes v8 => v9:
1. Fix an error with multi function program, in 4/9.

Changes v7 => v8:
1. Rebase and fix conflicts.
2. Lock text_mutex for text_poke_copy. (Daniel)

Changes v6 => v7:
1. Redesign the interface between generic and arch logic, based on feedback
   from Alexei and Ilya.
2. Split 6/7 of v6 to 7/9 and 8/9 in v7, for cleaner logic.
3. Add bpf_arch_text_copy in 6/9.

Changes v5 => v6:
1. Make jit_hole_buffer 128 byte long. Only fill the first and last 128
   bytes of header with INT3. (Alexei)
2. Use kvmalloc for temporary buffer. (Alexei)
3. Rename tmp_header/tmp_image => rw_header/rw_image. Remove tmp_image from
   x64_jit_data. (Alexei)
4. Change fall back round_up_to in bpf_jit_binary_alloc_pack() from
   BPF_PROG_MAX_PACK_PROG_SIZE to PAGE_SIZE.

Changes v4 => v5:
1. Do not use atomic64 for bpf_jit_current. (Alexei)

Changes v3 => v4:
1. Rename text_poke_jit() => text_poke_copy(). (Peter)
2. Change comment style. (Peter)

Changes v2 => v3:
1. Fix tailcall.

Changes v1 => v2:
1. Use text_poke instead of writing through linear mapping. (Peter)
2. Avoid making changes to non-x86_64 code.

Most BPF programs are small, but they consume a page each. For systems
with busy traffic and many BPF programs, this could also add significant
pressure to instruction TLB. High iTLB pressure usually causes slow down
for the whole system, which includes visible performance degradation for
production workloads.

This set tries to solve this problem with customized allocator that pack
multiple programs into a huge page.

Patches 1-6 prepare the work. Patch 7 contains key logic of bpf_prog_pack
allocator. Patch 8 contains bpf_jit_binary_pack_alloc logic on top of
bpf_prog_pack allocator. Patch 9 uses this allocator in x86_64 jit.
====================

Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents 128dac5f 1022a549
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -159,6 +159,7 @@ config X86
	select HAVE_ALIGNED_STRUCT_PAGE		if SLUB
	select HAVE_ARCH_AUDITSYSCALL
	select HAVE_ARCH_HUGE_VMAP		if X86_64 || X86_PAE
	select HAVE_ARCH_HUGE_VMALLOC		if HAVE_ARCH_HUGE_VMAP
	select HAVE_ARCH_JUMP_LABEL
	select HAVE_ARCH_JUMP_LABEL_RELATIVE
	select HAVE_ARCH_KASAN			if X86_64
+1 −0
Original line number Diff line number Diff line
@@ -44,6 +44,7 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len);
extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void text_poke_sync(void);
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
extern void *text_poke_copy(void *addr, const void *opcode, size_t len);
extern int poke_int3_handler(struct pt_regs *regs);
extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);

+34 −0
Original line number Diff line number Diff line
@@ -1102,6 +1102,40 @@ void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
	return __text_poke(addr, opcode, len);
}

/**
 * text_poke_copy - Copy instructions into (an unused part of) RX memory
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy, could be more than 2x PAGE_SIZE
 *
 * Not safe against concurrent execution; useful for JITs to dump
 * new code blocks into unused regions of RX memory. Can be used in
 * conjunction with synchronize_rcu_tasks() to wait for existing
 * execution to quiesce after having made sure no existing functions
 * pointers are live.
 */
void *text_poke_copy(void *addr, const void *opcode, size_t len)
{
	unsigned long start = (unsigned long)addr;
	size_t patched = 0;

	if (WARN_ON_ONCE(core_kernel_text(start)))
		return NULL;

	mutex_lock(&text_mutex);
	while (patched < len) {
		unsigned long ptr = start + patched;
		size_t s;

		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);

		__text_poke((void *)ptr, opcode + patched, s);
		patched += s;
	}
	mutex_unlock(&text_mutex);
	return addr;
}

static void do_sync_core(void *info)
{
	sync_core();
+38 −27
Original line number Diff line number Diff line
@@ -330,8 +330,7 @@ static int emit_jump(u8 **pprog, void *func, void *ip)
}

static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
				void *old_addr, void *new_addr,
				const bool text_live)
				void *old_addr, void *new_addr)
{
	const u8 *nop_insn = x86_nops[5];
	u8 old_insn[X86_PATCH_SIZE];
@@ -365,10 +364,7 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
		goto out;
	ret = 1;
	if (memcmp(ip, new_insn, X86_PATCH_SIZE)) {
		if (text_live)
		text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
		else
			memcpy(ip, new_insn, X86_PATCH_SIZE);
		ret = 0;
	}
out:
@@ -384,7 +380,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
		/* BPF poking in modules is not supported */
		return -EINVAL;

	return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true);
	return __bpf_arch_text_poke(ip, t, old_addr, new_addr);
}

#define EMIT_LFENCE()	EMIT3(0x0F, 0xAE, 0xE8)
@@ -558,24 +554,15 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
		mutex_lock(&array->aux->poke_mutex);
		target = array->ptrs[poke->tail_call.key];
		if (target) {
			/* Plain memcpy is used when image is not live yet
			 * and still not locked as read-only. Once poke
			 * location is active (poke->tailcall_target_stable),
			 * any parallel bpf_arch_text_poke() might occur
			 * still on the read-write image until we finally
			 * locked it as read-only. Both modifications on
			 * the given image are under text_mutex to avoid
			 * interference.
			 */
			ret = __bpf_arch_text_poke(poke->tailcall_target,
						   BPF_MOD_JUMP, NULL,
						   (u8 *)target->bpf_func +
						   poke->adj_off, false);
						   poke->adj_off);
			BUG_ON(ret < 0);
			ret = __bpf_arch_text_poke(poke->tailcall_bypass,
						   BPF_MOD_JUMP,
						   (u8 *)poke->tailcall_target +
						   X86_PATCH_SIZE, NULL, false);
						   X86_PATCH_SIZE, NULL);
			BUG_ON(ret < 0);
		}
		WRITE_ONCE(poke->tailcall_target_stable, true);
@@ -866,7 +853,7 @@ static void emit_nops(u8 **pprog, int len)

#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))

static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
		  int oldproglen, struct jit_context *ctx, bool jmp_padding)
{
	bool tail_call_reachable = bpf_prog->aux->tail_call_reachable;
@@ -893,8 +880,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
	push_callee_regs(&prog, callee_regs_used);

	ilen = prog - temp;
	if (image)
		memcpy(image + proglen, temp, ilen);
	if (rw_image)
		memcpy(rw_image + proglen, temp, ilen);
	proglen += ilen;
	addrs[0] = proglen;
	prog = temp;
@@ -1323,6 +1310,9 @@ st: if (is_imm8(insn->off))
					pr_err("extable->insn doesn't fit into 32-bit\n");
					return -EFAULT;
				}
				/* switch ex to rw buffer for writes */
				ex = (void *)rw_image + ((void *)ex - (void *)image);

				ex->insn = delta;

				ex->data = EX_TYPE_BPF;
@@ -1705,7 +1695,7 @@ st: if (is_imm8(insn->off))
				pr_err("bpf_jit: fatal error\n");
				return -EFAULT;
			}
			memcpy(image + proglen, temp, ilen);
			memcpy(rw_image + proglen, temp, ilen);
		}
		proglen += ilen;
		addrs[i] = proglen;
@@ -2246,6 +2236,7 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
}

struct x64_jit_data {
	struct bpf_binary_header *rw_header;
	struct bpf_binary_header *header;
	int *addrs;
	u8 *image;
@@ -2258,6 +2249,7 @@ struct x64_jit_data {

struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{
	struct bpf_binary_header *rw_header = NULL;
	struct bpf_binary_header *header = NULL;
	struct bpf_prog *tmp, *orig_prog = prog;
	struct x64_jit_data *jit_data;
@@ -2266,6 +2258,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
	bool tmp_blinded = false;
	bool extra_pass = false;
	bool padding = false;
	u8 *rw_image = NULL;
	u8 *image = NULL;
	int *addrs;
	int pass;
@@ -2301,6 +2294,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
		oldproglen = jit_data->proglen;
		image = jit_data->image;
		header = jit_data->header;
		rw_header = jit_data->rw_header;
		rw_image = (void *)rw_header + ((void *)image - (void *)header);
		extra_pass = true;
		padding = true;
		goto skip_init_addrs;
@@ -2331,12 +2326,12 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
	for (pass = 0; pass < MAX_PASSES || image; pass++) {
		if (!padding && pass >= PADDING_PASSES)
			padding = true;
		proglen = do_jit(prog, addrs, image, oldproglen, &ctx, padding);
		proglen = do_jit(prog, addrs, image, rw_image, oldproglen, &ctx, padding);
		if (proglen <= 0) {
out_image:
			image = NULL;
			if (header)
				bpf_jit_binary_free(header);
				bpf_jit_binary_pack_free(header, rw_header);
			prog = orig_prog;
			goto out_addrs;
		}
@@ -2360,8 +2355,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
				sizeof(struct exception_table_entry);

			/* allocate module memory for x86 insns and extable */
			header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size,
						      &image, align, jit_fill_hole);
			header = bpf_jit_binary_pack_alloc(roundup(proglen, align) + extable_size,
							   &image, align, &rw_header, &rw_image,
							   jit_fill_hole);
			if (!header) {
				prog = orig_prog;
				goto out_addrs;
@@ -2377,14 +2373,22 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)

	if (image) {
		if (!prog->is_func || extra_pass) {
			/*
			 * bpf_jit_binary_pack_finalize fails in two scenarios:
			 *   1) header is not pointing to proper module memory;
			 *   2) the arch doesn't support bpf_arch_text_copy().
			 *
			 * Both cases are serious bugs that we should not continue.
			 */
			BUG_ON(bpf_jit_binary_pack_finalize(prog, header, rw_header));
			bpf_tail_call_direct_fixup(prog);
			bpf_jit_binary_lock_ro(header);
		} else {
			jit_data->addrs = addrs;
			jit_data->ctx = ctx;
			jit_data->proglen = proglen;
			jit_data->image = image;
			jit_data->header = header;
			jit_data->rw_header = rw_header;
		}
		prog->bpf_func = (void *)image;
		prog->jited = 1;
@@ -2412,3 +2416,10 @@ bool bpf_jit_supports_kfunc_call(void)
{
	return true;
}

void *bpf_arch_text_copy(void *dst, void *src, size_t len)
{
	if (text_poke_copy(dst, src, len) == NULL)
		return ERR_PTR(-EINVAL);
	return dst;
}
+5 −2
Original line number Diff line number Diff line
@@ -846,8 +846,8 @@ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym);
void bpf_image_ksym_del(struct bpf_ksym *ksym);
void bpf_ksym_add(struct bpf_ksym *ksym);
void bpf_ksym_del(struct bpf_ksym *ksym);
int bpf_jit_charge_modmem(u32 pages);
void bpf_jit_uncharge_modmem(u32 pages);
int bpf_jit_charge_modmem(u32 size);
void bpf_jit_uncharge_modmem(u32 size);
bool bpf_prog_has_trampoline(const struct bpf_prog *prog);
#else
static inline int bpf_trampoline_link_prog(struct bpf_prog *prog,
@@ -953,6 +953,7 @@ struct bpf_prog_aux {
	bool sleepable;
	bool tail_call_reachable;
	bool xdp_has_frags;
	bool use_bpf_prog_pack;
	struct hlist_node tramp_hlist;
	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
	const struct btf_type *attach_func_proto;
@@ -2362,6 +2363,8 @@ enum bpf_text_poke_type {
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
		       void *addr1, void *addr2);

void *bpf_arch_text_copy(void *dst, void *src, size_t len);

struct btf_id_set;
bool btf_id_set_contains(const struct btf_id_set *set, u32 id);

Loading