Commit 23e403b3 authored by Alexei Starovoitov
Browse files

Merge branch 'BPF open-coded iterators'



Andrii Nakryiko says:

====================

Add support for open-coded (aka inline) iterators in the BPF world. This is the
next step in gradually allowing more powerful and less restrictive looping and
iteration capabilities in BPF programs.

We set up a framework for implementing all kinds of iterators (e.g., cgroup,
task, file, etc., iterators), but this patch set only implements a numbers
iterator, which is used to implement the ergonomic bpf_for() for-like construct
(see patches #4-#5). We also add bpf_for_each(), which is a generic
foreach-like construct that will work with any kind of open-coded iterator
implementation, as long as we stick with the bpf_iter_<type>_{new,next,destroy}()
naming pattern (which we now enforce on the kernel side).

Patch #1 is a preparatory refactoring that makes it easier to check for special
kfunc calls. Patch #2 adds iterator kfunc registration and validation logic,
which is mostly independent from the rest of the open-coded iterator logic, so
it is separated out for easier reviewing.

The meat of the verifier-side logic is in patch #3. Patch #4 implements the
numbers iterator. I kept them separate to have a clean reference for how to
integrate new iterator types (now even simpler to do than in v1 of this patch
set). Patch #5 adds bpf_for(), bpf_for_each(), and bpf_repeat() macros to
bpf_misc.h, and also adds yet another pyperf test variant, now with a bpf_for()
loop. Patch #6 adds verification tests, based on the numbers iterator (the only
one available right now). Patch #7 actually tests the runtime behavior of the
numbers iterator.

Finally, with the changes in v2, it's possible and trivial to implement custom
iterators completely in kernel modules, which we showcase and test by adding
a simple iterator that returns the same number a given number of times to
bpf_testmod. Patch #8 is where all this happens and is tested.

Most of the relevant details are in corresponding commit messages or code
comments.

v4->v5:
  - fixing missed inner for() in is_iter_reg_valid_uninit, and fixed return
    false (kernel test robot);
  - typo fixes and comment/commit description improvements throughout the
    patch set;
v3->v4:
  - remove unused variable from is_iter_reg_valid_init (kernel test robot);
v2->v3:
  - remove special kfunc leftovers for bpf_iter_num_{new,next,destroy};
  - add iters/testmod_seq* to DENYLIST.s390x, it doesn't support kfuncs in
    modules yet (CI);
v1->v2:
  - rebased on latest, dropping previously landed preparatory patches;
  - each iterator type now has its own `struct bpf_iter_<type>`, which allows
    each iterator implementation to use exactly as much stack space as
    necessary, avoiding runtime allocations (Alexei);
  - reworked how iterator kfuncs are defined, no verifier changes are required
    when adding new iterator type;
  - added bpf_testmod-based iterator implementation;
  - address the rest of feedback, comments, commit message adjustment, etc.

Cc: Tejun Heo <tj@kernel.org>
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents ed69e066 7e86a8c4
Loading
Loading
Loading
Loading
+6 −2
Original line number Diff line number Diff line
@@ -1617,8 +1617,12 @@ struct bpf_array {
#define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
#define MAX_TAIL_CALL_CNT 33

/* Maximum number of loops for bpf_loop */
#define BPF_MAX_LOOPS	BIT(23)
/* Upper bound on the number of iterations for bpf_loop and bpf_iter_num.
 * Declared as an enum constant rather than a macro so that the value is
 * exposed (and thus discoverable) through BTF.
 */
enum {
	BPF_MAX_LOOPS = 1 << 23, /* 8 * 1024 * 1024 */
};

#define BPF_F_ACCESS_MASK	(BPF_F_RDONLY |		\
				 BPF_F_RDONLY_PROG |	\
+25 −0
Original line number Diff line number Diff line
@@ -59,6 +59,14 @@ struct bpf_active_lock {
	u32 id;
};

#define ITER_PREFIX "bpf_iter_"

/* State tracked for a bpf_iter stack slot. All values fit into the 2-bit
 * bitfield that stores them in struct bpf_reg_state.
 */
enum bpf_iter_state {
	/* for non-first slot */
	BPF_ITER_STATE_INVALID = 0,
	BPF_ITER_STATE_ACTIVE = 1,
	BPF_ITER_STATE_DRAINED = 2,
};

struct bpf_reg_state {
	/* Ordering of fields matters.  See states_equal() */
	enum bpf_reg_type type;
@@ -103,6 +111,18 @@ struct bpf_reg_state {
			bool first_slot;
		} dynptr;

		/* For bpf_iter stack slots */
		struct {
			/* BTF container and BTF type ID describing
			 * struct bpf_iter_<type> of an iterator state
			 */
			struct btf *btf;
			u32 btf_id;
			/* packing following two fields to fit iter state into 16 bytes */
			enum bpf_iter_state state:2;
			int depth:30;
		} iter;

		/* Max size from any of the above. */
		struct {
			unsigned long raw1;
@@ -141,6 +161,8 @@ struct bpf_reg_state {
	 * same reference to the socket, to determine proper reference freeing.
	 * For stack slots that are dynptrs, this is used to track references to
	 * the dynptr to determine proper reference freeing.
	 * Similarly to dynptrs, we use ID to track "belonging" of a reference
	 * to a specific instance of bpf_iter.
	 */
	u32 id;
	/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
@@ -211,9 +233,11 @@ enum bpf_stack_slot_type {
	 * is stored in bpf_stack_state->spilled_ptr.dynptr.type
	 */
	STACK_DYNPTR,
	STACK_ITER,
};

#define BPF_REG_SIZE 8	/* size of eBPF register in bytes */

#define BPF_DYNPTR_SIZE		sizeof(struct bpf_dynptr_kern)
#define BPF_DYNPTR_NR_SLOTS		(BPF_DYNPTR_SIZE / BPF_REG_SIZE)

@@ -448,6 +472,7 @@ struct bpf_insn_aux_data {
	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
	bool zext_dst; /* this insn zero extends dst reg */
	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
	u8 alu_state; /* used in combination with alu_limit */

	/* below fields are initialized once */
+4 −0
Original line number Diff line number Diff line
@@ -71,6 +71,10 @@
#define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
#define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
#define KF_RCU          (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
/* only one of KF_ITER_{NEW,NEXT,DESTROY} could be specified per kfunc */
#define KF_ITER_NEW     (1 << 8) /* kfunc implements BPF iter constructor */
#define KF_ITER_NEXT    (1 << 9) /* kfunc implements BPF iter next method */
#define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */

/*
 * Tag marking a kernel function as a kfunc. This is meant to minimize the
+8 −0
Original line number Diff line number Diff line
@@ -7112,4 +7112,12 @@ enum {
	BPF_F_TIMER_ABS = (1ULL << 0),
};

/* BPF numbers iterator state */
/* BPF numbers iterator state.
 *
 * The contents are opaque to users of this header: the kernel-side
 * implementation reinterprets this memory as its own private state struct
 * (struct bpf_iter_num_kern), so size and alignment must stay in sync.
 */
struct bpf_iter_num {
	/* opaque iterator state; having __u64 here allows to preserve correct
	 * alignment requirements in vmlinux.h, generated from BTF
	 */
	__u64 __opaque[1];
} __attribute__((aligned(8)));

#endif /* _UAPI__LINUX_BPF_H__ */
+70 −0
Original line number Diff line number Diff line
@@ -776,3 +776,73 @@ const struct bpf_func_proto bpf_loop_proto = {
	.arg3_type	= ARG_PTR_TO_STACK_OR_NULL,
	.arg4_type	= ARG_ANYTHING,
};

/* Kernel-side view of struct bpf_iter_num, i.e. the actual numbers iterator
 * state. bpf_iter_num_new() asserts at build time that size and alignment
 * of the two structs match.
 */
struct bpf_iter_num_kern {
	int cur; /* current value, inclusive */
	int end; /* final value, exclusive */
} __aligned(8);

__diag_push();
__diag_ignore_all("-Wmissing-prototypes",
		  "Global functions as their definitions will be in vmlinux BTF");

/* Initialize a numbers iterator over the half-open range [start, end).
 *
 * @it: iterator state to set up; reinterpreted as struct bpf_iter_num_kern
 * @start: first value of the range (inclusive)
 * @end: end of the range (exclusive)
 *
 * Returns 0 on success, -EINVAL if start > end, or -E2BIG if the range
 * covers more than BPF_MAX_LOOPS values. On failure the state is zeroed
 * (cur == end == 0), which makes bpf_iter_num_next() return NULL right away.
 */
__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
{
	struct bpf_iter_num_kern *s = (void *)it;

	/* the public opaque struct and the kernel-side state must be
	 * layout-compatible, as one is cast to the other
	 */
	BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
	BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));

	/* emit BTF for the public iterator struct this kfunc operates on */
	BTF_TYPE_EMIT(struct bpf_iter_num);

	/* start == end is legit, it's an empty range and we'll just get NULL
	 * on first (and any subsequent) bpf_iter_num_next() call
	 */
	if (start > end) {
		s->cur = s->end = 0;
		return -EINVAL;
	}

	/* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */
	if ((s64)end - (s64)start > BPF_MAX_LOOPS) {
		s->cur = s->end = 0;
		return -E2BIG;
	}

	/* user will call bpf_iter_num_next() first,
	 * which will set s->cur to exactly start value;
	 * underflow shouldn't matter
	 */
	s->cur = start - 1;
	s->end = end;

	return 0;
}

/* Advance a numbers iterator.
 *
 * @it: iterator state previously set up by bpf_iter_num_new()
 *
 * Returns a pointer to the next value in the range (the pointer refers to
 * the iterator's own state), or NULL once the range is exhausted or if
 * initialization failed. When NULL is returned the state is zeroed, so
 * every subsequent call returns NULL as well.
 */
__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num *it)
{
	struct bpf_iter_num_kern *s = (void *)it;

	/* check failed initialization or if we are done (same behavior);
	 * need to be careful about overflow, so convert to s64 for checks,
	 * e.g., if s->cur == s->end == INT_MAX, we can't just do
	 * s->cur + 1 >= s->end
	 *
	 * NOTE(review): the addition is done in int before the widening
	 * cast, so this presumably relies on wrapping signed arithmetic in
	 * the kernel build -- confirm. Do not move the cast inside without
	 * also revisiting bpf_iter_num_new(): it stores start - 1, which for
	 * start == INT_MIN wraps to INT_MAX and must wrap back to INT_MIN
	 * here.
	 */
	if ((s64)(s->cur + 1) >= s->end) {
		s->cur = s->end = 0;
		return NULL;
	}

	s->cur++;

	return &s->cur;
}

/* Tear down a numbers iterator.
 *
 * @it: iterator state to reset
 *
 * Nothing needs to be released; the state is simply zeroed
 * (cur == end == 0).
 */
__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
{
	struct bpf_iter_num_kern *state = (void *)it;

	state->end = 0;
	state->cur = 0;
}

__diag_pop();
Loading