Commit 90065c06 authored by Alexei Starovoitov's avatar Alexei Starovoitov
Browse files

Merge branch 'fix-bpf_get_stack-with-PEBS'

Song Liu says:

====================
Calling get_perf_callchain() on perf_events from PEBS entries may cause
unwinder errors. To fix this issue, perf subsystem fetches callchain early,
and marks perf_events are marked with __PERF_SAMPLE_CALLCHAIN_EARLY.
Similar issue exists when BPF program calls get_perf_callchain() via
helper functions. For more information about this issue, please refer to
discussions in [1].

This set fixes this issue with helper proto bpf_get_stackid_pe and
bpf_get_stack_pe.

[1] https://lore.kernel.org/lkml/ED7B9430-6489-4260-B3C5-9CFA2E3AA87A@fb.com/



Changes v4 => v5:
1. Return -EPROTO instead of -EINVAL on PERF_EVENT_IOC_SET_BPF errors.
   (Alexei)
2. Let libbpf print a hint message when PERF_EVENT_IOC_SET_BPF returns
   -EPROTO. (Alexei)

Changes v3 => v4:
1. Fix error check logic in bpf_get_stackid_pe and bpf_get_stack_pe.
   (Alexei)
2. Do not allow attaching BPF programs with bpf_get_stack|stackid to
   perf_event with precise_ip > 0, but not proper callchain. (Alexei)
3. Add selftest get_stackid_cannot_attach.

Changes v2 => v3:
1. Fix handling of stackmap skip field. (Andrii)
2. Simplify the code in a few places. (Andrii)

Changes v1 => v2:
1. Simplify the design and avoid introducing new helper function. (Andrii)
====================

Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents 909e446b 346938e9
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -1675,6 +1675,8 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto;
extern const struct bpf_func_proto bpf_get_stack_proto;
extern const struct bpf_func_proto bpf_get_task_stack_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto_pe;
extern const struct bpf_func_proto bpf_get_stack_proto_pe;
extern const struct bpf_func_proto bpf_sock_map_update_proto;
extern const struct bpf_func_proto bpf_sock_hash_update_proto;
extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
+2 −1
Original line number Diff line number Diff line
@@ -533,7 +533,8 @@ struct bpf_prog {
				is_func:1,	/* program is a bpf function */
				kprobe_override:1, /* Do we override a kprobe? */
				has_callchain_buf:1, /* callchain buffer allocated? */
				enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */
				enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
				call_get_stack:1; /* Do we call bpf_get_stack() or bpf_get_stackid() */
	enum bpf_prog_type	type;		/* Type of BPF program */
	enum bpf_attach_type	expected_attach_type; /* For some prog types */
	u32			len;		/* Number of filter blocks */
+166 −18
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include <linux/elf.h>
@@ -387,11 +388,10 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
#endif
}

BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags)
static long __bpf_get_stackid(struct bpf_map *map,
			      struct perf_callchain_entry *trace, u64 flags)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct perf_callchain_entry *trace;
	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
	u32 max_depth = map->value_size / stack_map_data_size(map);
	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
@@ -399,21 +399,9 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	u32 hash, id, trace_nr, trace_len;
	bool user = flags & BPF_F_USER_STACK;
	bool kernel = !user;
	u64 *ips;
	bool hash_matches;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
		return -EINVAL;

	trace = get_perf_callchain(regs, init_nr, kernel, user,
				   sysctl_perf_event_max_stack, false, false);

	if (unlikely(!trace))
		/* couldn't fetch the stack trace */
		return -EFAULT;

	/* get_perf_callchain() guarantees that trace->nr >= init_nr
	 * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
	 */
@@ -478,6 +466,30 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
	return id;
}

BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags)
{
	u32 max_depth = map->value_size / stack_map_data_size(map);
	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
	bool user = flags & BPF_F_USER_STACK;
	struct perf_callchain_entry *trace;
	bool kernel = !user;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
		return -EINVAL;

	trace = get_perf_callchain(regs, init_nr, kernel, user,
				   sysctl_perf_event_max_stack, false, false);

	if (unlikely(!trace))
		/* couldn't fetch the stack trace */
		return -EFAULT;

	return __bpf_get_stackid(map, trace, flags);
}

const struct bpf_func_proto bpf_get_stackid_proto = {
	.func		= bpf_get_stackid,
	.gpl_only	= true,
@@ -487,7 +499,77 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
	.arg3_type	= ARG_ANYTHING,
};

static __u64 count_kernel_ip(struct perf_callchain_entry *trace)
{
	__u64 nr_kernel = 0;

	while (nr_kernel < trace->nr) {
		if (trace->ip[nr_kernel] == PERF_CONTEXT_USER)
			break;
		nr_kernel++;
	}
	return nr_kernel;
}

BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
	   struct bpf_map *, map, u64, flags)
{
	struct perf_event *event = ctx->event;
	struct perf_callchain_entry *trace;
	bool kernel, user;
	__u64 nr_kernel;
	int ret;

	/* perf_sample_data doesn't have callchain, use bpf_get_stackid */
	if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
		return bpf_get_stackid((unsigned long)(ctx->regs),
				       (unsigned long) map, flags, 0, 0);

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
		return -EINVAL;

	user = flags & BPF_F_USER_STACK;
	kernel = !user;

	trace = ctx->data->callchain;
	if (unlikely(!trace))
		return -EFAULT;

	nr_kernel = count_kernel_ip(trace);

	if (kernel) {
		__u64 nr = trace->nr;

		trace->nr = nr_kernel;
		ret = __bpf_get_stackid(map, trace, flags);

		/* restore nr */
		trace->nr = nr;
	} else { /* user */
		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;

		skip += nr_kernel;
		if (skip > BPF_F_SKIP_FIELD_MASK)
			return -EFAULT;

		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
		ret = __bpf_get_stackid(map, trace, flags);
	}
	return ret;
}

const struct bpf_func_proto bpf_get_stackid_proto_pe = {
	.func		= bpf_get_stackid_pe,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};

static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
			    struct perf_callchain_entry *trace_in,
			    void *buf, u32 size, u64 flags)
{
	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
@@ -520,7 +602,9 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
	else
		init_nr = sysctl_perf_event_max_stack - num_elem;

	if (kernel && task)
	if (trace_in)
		trace = trace_in;
	else if (kernel && task)
		trace = get_callchain_entry_for_task(task, init_nr);
	else
		trace = get_perf_callchain(regs, init_nr, kernel, user,
@@ -556,7 +640,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
	   u64, flags)
{
	return __bpf_get_stack(regs, NULL, buf, size, flags);
	return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
}

const struct bpf_func_proto bpf_get_stack_proto = {
@@ -574,7 +658,7 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
{
	struct pt_regs *regs = task_pt_regs(task);

	return __bpf_get_stack(regs, task, buf, size, flags);
	return __bpf_get_stack(regs, task, NULL, buf, size, flags);
}

BTF_ID_LIST(bpf_get_task_stack_btf_ids)
@@ -591,6 +675,70 @@ const struct bpf_func_proto bpf_get_task_stack_proto = {
	.btf_id		= bpf_get_task_stack_btf_ids,
};

BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
	   void *, buf, u32, size, u64, flags)
{
	struct perf_event *event = ctx->event;
	struct perf_callchain_entry *trace;
	bool kernel, user;
	int err = -EINVAL;
	__u64 nr_kernel;

	if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
		return __bpf_get_stack(ctx->regs, NULL, NULL, buf, size, flags);

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_USER_BUILD_ID)))
		goto clear;

	user = flags & BPF_F_USER_STACK;
	kernel = !user;

	err = -EFAULT;
	trace = ctx->data->callchain;
	if (unlikely(!trace))
		goto clear;

	nr_kernel = count_kernel_ip(trace);

	if (kernel) {
		__u64 nr = trace->nr;

		trace->nr = nr_kernel;
		err = __bpf_get_stack(ctx->regs, NULL, trace, buf,
				      size, flags);

		/* restore nr */
		trace->nr = nr;
	} else { /* user */
		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;

		skip += nr_kernel;
		if (skip > BPF_F_SKIP_FIELD_MASK)
			goto clear;

		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
		err = __bpf_get_stack(ctx->regs, NULL, trace, buf,
				      size, flags);
	}
	return err;

clear:
	memset(buf, 0, size);
	return err;

}

const struct bpf_func_proto bpf_get_stack_proto_pe = {
	.func		= bpf_get_stack_pe,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};

/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
+3 −0
Original line number Diff line number Diff line
@@ -4962,6 +4962,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
		env->prog->has_callchain_buf = true;
	}

	if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
		env->prog->call_get_stack = true;

	if (changes_data)
		clear_all_pkt_pointers(env);
	return 0;
+18 −0
Original line number Diff line number Diff line
@@ -9544,6 +9544,24 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	if (event->attr.precise_ip &&
	    prog->call_get_stack &&
	    (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
	     event->attr.exclude_callchain_kernel ||
	     event->attr.exclude_callchain_user)) {
		/*
		 * On perf_event with precise_ip, calling bpf_get_stack()
		 * may trigger unwinder warnings and occasional crashes.
		 * bpf_get_[stack|stackid] works around this issue by using
		 * callchain attached to perf_sample_data. If the
		 * perf_event does not full (kernel and user) callchain
		 * attached to perf_sample_data, do not allow attaching BPF
		 * program that calls bpf_get_[stack|stackid].
		 */
		bpf_prog_put(prog);
		return -EPROTO;
	}

	event->prog = prog;
	event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
	WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
Loading