Commit e384c7b7 authored by Kui-Feng Lee's avatar Kui-Feng Lee Committed by Andrii Nakryiko
Browse files

bpf, x86: Create bpf_tramp_run_ctx on the caller thread's stack



BPF trampolines will create a bpf_tramp_run_ctx, a bpf_run_ctx, on
stacks and set/reset the current bpf_run_ctx before/after calling a
bpf_prog.

Signed-off-by: default avatarKui-Feng Lee <kuifeng@fb.com>
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
Signed-off-by: default avatarAndrii Nakryiko <andrii@kernel.org>
Acked-by: default avatarAndrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220510205923.3206889-3-kuifeng@fb.com
parent f7e0beaf
Loading
Loading
Loading
Loading
+32 −9
Original line number Diff line number Diff line
@@ -1763,14 +1763,30 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args,

static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
			   struct bpf_tramp_link *l, int stack_size,
			   bool save_ret)
			   int run_ctx_off, bool save_ret)
{
	u8 *prog = *pprog;
	u8 *jmp_insn;
	int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
	struct bpf_prog *p = l->link.prog;

	/* mov rdi, 0 */
	emit_mov_imm64(&prog, BPF_REG_1, 0, 0);

	/* Prepare struct bpf_tramp_run_ctx.
	 *
	 * bpf_tramp_run_ctx is already preserved by
	 * arch_prepare_bpf_trampoline().
	 *
	 * mov QWORD PTR [rbp - run_ctx_off + ctx_cookie_off], rdi
	 */
	emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_1, -run_ctx_off + ctx_cookie_off);

	/* arg1: mov rdi, progs[i] */
	emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
	/* arg2: lea rsi, [rbp - ctx_cookie_off] */
	EMIT4(0x48, 0x8D, 0x75, -run_ctx_off);

	if (emit_call(&prog,
		      p->aux->sleepable ? __bpf_prog_enter_sleepable :
		      __bpf_prog_enter, prog))
@@ -1816,6 +1832,8 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
	emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
	/* arg2: mov rsi, rbx <- start time in nsec */
	emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
	/* arg3: lea rdx, [rbp - run_ctx_off] */
	EMIT4(0x48, 0x8D, 0x55, -run_ctx_off);
	if (emit_call(&prog,
		      p->aux->sleepable ? __bpf_prog_exit_sleepable :
		      __bpf_prog_exit, prog))
@@ -1853,14 +1871,14 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond)

static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
		      struct bpf_tramp_links *tl, int stack_size,
		      bool save_ret)
		      int run_ctx_off, bool save_ret)
{
	int i;
	u8 *prog = *pprog;

	for (i = 0; i < tl->nr_links; i++) {
		if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size,
				    save_ret))
				    run_ctx_off, save_ret))
			return -EINVAL;
	}
	*pprog = prog;
@@ -1869,7 +1887,7 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,

static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
			      struct bpf_tramp_links *tl, int stack_size,
			      u8 **branches)
			      int run_ctx_off, u8 **branches)
{
	u8 *prog = *pprog;
	int i;
@@ -1880,7 +1898,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
	emit_mov_imm32(&prog, false, BPF_REG_0, 0);
	emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
	for (i = 0; i < tl->nr_links; i++) {
		if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, true))
		if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true))
			return -EINVAL;

		/* mod_ret prog stored return value into [rbp - 8]. Emit:
@@ -1986,7 +2004,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
				void *orig_call)
{
	int ret, i, nr_args = m->nr_args;
	int regs_off, ip_off, args_off, stack_size = nr_args * 8;
	int regs_off, ip_off, args_off, stack_size = nr_args * 8, run_ctx_off;
	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
	struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
	struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
@@ -2016,6 +2034,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
	 * RBP - args_off  [ args count      ]  always
	 *
	 * RBP - ip_off    [ traced function ]  BPF_TRAMP_F_IP_ARG flag
	 *
	 * RBP - run_ctx_off [ bpf_tramp_run_ctx ]
	 */

	/* room for return value of orig_call or fentry prog */
@@ -2034,6 +2054,9 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i

	ip_off = stack_size;

	stack_size += (sizeof(struct bpf_tramp_run_ctx) + 7) & ~0x7;
	run_ctx_off = stack_size;

	if (flags & BPF_TRAMP_F_SKIP_FRAME) {
		/* skip patched call instruction and point orig_call to actual
		 * body of the kernel function.
@@ -2081,7 +2104,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
	}

	if (fentry->nr_links)
		if (invoke_bpf(m, &prog, fentry, regs_off,
		if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off,
			       flags & BPF_TRAMP_F_RET_FENTRY_RET))
			return -EINVAL;

@@ -2092,7 +2115,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
			return -ENOMEM;

		if (invoke_bpf_mod_ret(m, &prog, fmod_ret, regs_off,
				       branches)) {
				       run_ctx_off, branches)) {
			ret = -EINVAL;
			goto cleanup;
		}
@@ -2129,7 +2152,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
	}

	if (fexit->nr_links)
		if (invoke_bpf(m, &prog, fexit, regs_off, false)) {
		if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, false)) {
			ret = -EINVAL;
			goto cleanup;
		}
+13 −4
Original line number Diff line number Diff line
@@ -730,6 +730,8 @@ struct bpf_tramp_links {
	int nr_links;
};

struct bpf_tramp_run_ctx;

/* Different use cases for BPF trampoline:
 * 1. replace nop at the function entry (kprobe equivalent)
 *    flags = BPF_TRAMP_F_RESTORE_REGS
@@ -756,10 +758,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *i
				struct bpf_tramp_links *tlinks,
				void *orig_call);
/* these two functions are called from generated trampoline */
u64 notrace __bpf_prog_enter(struct bpf_prog *prog);
void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog);
void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start);
u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx);
void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx);
u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx);
void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
				       struct bpf_tramp_run_ctx *run_ctx);
void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr);
void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr);

@@ -1351,6 +1354,12 @@ struct bpf_trace_run_ctx {
	u64 bpf_cookie;
};

struct bpf_tramp_run_ctx {
	struct bpf_run_ctx run_ctx;
	u64 bpf_cookie;
	struct bpf_run_ctx *saved_run_ctx;
};

static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx)
{
	struct bpf_run_ctx *old_ctx = NULL;
+5 −2
Original line number Diff line number Diff line
@@ -5020,6 +5020,7 @@ static bool syscall_prog_is_valid_access(int off, int size,
BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
{
	struct bpf_prog * __maybe_unused prog;
	struct bpf_tramp_run_ctx __maybe_unused run_ctx;

	switch (cmd) {
	case BPF_MAP_CREATE:
@@ -5047,13 +5048,15 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
			return -EINVAL;
		}

		if (!__bpf_prog_enter_sleepable(prog)) {
		run_ctx.bpf_cookie = 0;
		run_ctx.saved_run_ctx = NULL;
		if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) {
			/* recursion detected */
			bpf_prog_put(prog);
			return -EBUSY;
		}
		attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
		__bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */);
		__bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx);
		bpf_prog_put(prog);
		return 0;
#endif
+16 −4
Original line number Diff line number Diff line
@@ -568,11 +568,14 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
 * [2..MAX_U64] - execute bpf prog and record execution time.
 *     This is start time.
 */
u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
	__acquires(RCU)
{
	rcu_read_lock();
	migrate_disable();

	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);

	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
		inc_misses_counter(prog);
		return 0;
@@ -602,29 +605,38 @@ static void notrace update_prog_stats(struct bpf_prog *prog,
	}
}

void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx)
	__releases(RCU)
{
	bpf_reset_run_ctx(run_ctx->saved_run_ctx);

	update_prog_stats(prog, start);
	__this_cpu_dec(*(prog->active));
	migrate_enable();
	rcu_read_unlock();
}

u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog)
u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
{
	rcu_read_lock_trace();
	migrate_disable();
	might_fault();

	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
		inc_misses_counter(prog);
		return 0;
	}

	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);

	return bpf_prog_start_time();
}

void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
				       struct bpf_tramp_run_ctx *run_ctx)
{
	bpf_reset_run_ctx(run_ctx->saved_run_ctx);

	update_prog_stats(prog, start);
	__this_cpu_dec(*(prog->active));
	migrate_enable();