Commit 7a0788fe authored by Alexei Starovoitov

Merge branch 'Shared ownership for local kptrs'



Dave Marchevsky says:

====================

This series adds support for refcounted local kptrs to the verifier. A local
kptr is 'refcounted' if its type contains a struct bpf_refcount field:

  struct refcounted_node {
    long data;
    struct bpf_list_node ll;
    struct bpf_refcount ref;
  };

bpf_refcount is used to implement shared ownership for local kptrs.

Motivating usecase
==================

If a struct has two collection node fields, e.g.:

  struct node {
    long key;
    long val;
    struct bpf_rb_node rb;
    struct bpf_list_node ll;
  };

It's not currently possible to add a node to both the list and rbtree:

  long bpf_prog(void *ctx)
  {
    struct node *n = bpf_obj_new(typeof(*n));
    if (!n) { /* ... */ }

    bpf_spin_lock(&lock);

    bpf_list_push_back(&head, &n->ll);
    bpf_rbtree_add(&root, &n->rb, less); /* Assume a reasonable less() */
    bpf_spin_unlock(&lock);
  }

The above program will fail verification due to current owning / non-owning ref
logic: after bpf_list_push_back, n is a non-owning reference and thus cannot be
passed to bpf_rbtree_add. The only way to get an owning reference for the node
that was added is to bpf_list_pop_{front,back} it.

More generally, verifier ownership semantics expect that a node has one
owner (program, collection, or stashed in a map) with exclusive ownership
of the node's lifetime. The owner frees the node's underlying memory when it
itself goes away.

Without a shared ownership concept it's impossible to express many real-world
usecases such that they pass verification.
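
With a bpf_refcount field in the node type, the program above becomes
expressible: take a second owning reference before inserting. A sketch of the
post-series version (prog-side kfunc signatures paraphrased from this series'
selftests, so details may differ slightly):

  struct node {
    long key;
    long val;
    struct bpf_rb_node rb;
    struct bpf_list_node ll;
    struct bpf_refcount ref;
  };

  long bpf_prog(void *ctx)
  {
    struct node *m, *n = bpf_obj_new(typeof(*n));
    if (!n) { /* ... */ }

    m = bpf_refcount_acquire(n); /* bump refcount, m is a second owning ref */

    bpf_spin_lock(&lock);
    bpf_list_push_back(&head, &n->ll);   /* ownership of n passed to list */
    bpf_rbtree_add(&root, &m->rb, less); /* ok: m is still an owning ref */
    bpf_spin_unlock(&lock);
  }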

Semantic Changes
================

Before this series, the verifier could make this statement: "whoever has the
owning reference has exclusive ownership of the referent's lifetime". As
demonstrated in the previous section, this implies that a BPF program can't
have an owning reference to some node if that node is in a collection. If
such a state were possible, the node would have multiple owners, each thinking
they have exclusive ownership. In order to support shared ownership it's
necessary to modify the exclusive ownership semantic.

After this series' changes, an owning reference has ownership of the referent's
lifetime, but it's not necessarily exclusive. The referent's underlying memory
is guaranteed to be valid (i.e. not free'd) until the reference is dropped or
used for collection insert.

This change doesn't affect UX of owning or non-owning references much:

  * insert kfuncs (bpf_rbtree_add, bpf_list_push_{front,back}) still require
    an owning reference arg, as ownership still must be passed to the
    collection in a shared-ownership world.

  * non-owning references still refer to valid memory without claiming
    any ownership (see the sketch just after this list).
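
For example (a sketch under the same assumptions as above), a non-owning
reference left behind by an insert can still be used to read and write the
node's data fields while the lock is held:

  bpf_spin_lock(&lock);
  bpf_list_push_back(&head, &n->ll); /* n is now a non-owning reference */
  n->val = 42; /* ok: n's memory is guaranteed valid under the lock */
  bpf_spin_unlock(&lock);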

One important conclusion that followed from the "exclusive ownership"
statement is no longer valid, though. In an exclusive-ownership world, if a
BPF prog has an owning reference to a node, the verifier can conclude that no
collection has ownership of it. This conclusion was used to avoid runtime
checking in the implementations of insert and remove operations ("has the
node already been {inserted, removed}?").

In a shared-ownership world the aforementioned conclusion is no longer valid,
which necessitates doing runtime checking in insert and remove operation
kfuncs, and those functions possibly failing to insert or remove anything.
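
Concretely, programs now check the insert kfuncs' int return value and
bpf_rbtree_remove's possibly-NULL return. A sketch (again paraphrasing this
series' selftest usage, so details may differ):

  struct bpf_rb_node *res;

  bpf_spin_lock(&lock);
  if (bpf_rbtree_add(&root, &n->rb, less)) {
    /* runtime check failed: n->rb was already owned by a tree */
  }
  res = bpf_rbtree_remove(&root, &n->rb);
  bpf_spin_unlock(&lock);

  if (res) {
    /* successful remove returns an owning reference, which must
     * eventually be dropped
     */
    bpf_obj_drop(container_of(res, struct node, rb));
  }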

Luckily the verifier changes necessary to go from exclusive to shared ownership
were fairly minimal. Patches in this series which do change verifier semantics
generally have some summary dedicated to explaining why certain usecases
Just Work for shared ownership without verifier changes.

Implementation
==============

The changes in this series can be categorized as follows:

  * struct bpf_refcount opaque field + plumbing
  * support for refcounted kptrs in bpf_obj_new and bpf_obj_drop
  * bpf_refcount_acquire kfunc
    * enables shared ownership by bumping refcount + acquiring owning ref
      (see the sketch just after this list)
  * support for possibly-failing collection insertion and removal
    * insertion changes are more complex
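
The acquire operation itself is conceptually a refcount bump at the offset
cached in btf_record. A minimal sketch (simplified; the real kfunc added in
patch 4 also takes a btf_struct_meta argument that the verifier rewrites):

  static void *refcount_acquire_sketch(void *p, const struct btf_record *rec)
  {
    /* rec->refcount_off is cached by btf_parse_fields, see diff below */
    refcount_t *ref = (refcount_t *)(p + rec->refcount_off);

    refcount_inc(ref);
    return p; /* caller now holds an additional owning reference */
  }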

If a patch's changes have some nuance to their effect - or lack of effect - on
verifier behavior, the patch summary talks about it at length.

Patch contents:
  * Patch 1 removes btf_field_offs struct
  * Patch 2 adds struct bpf_refcount and associated plumbing
  * Patch 3 modifies semantics of bpf_obj_drop and bpf_obj_new to handle
    refcounted kptrs
  * Patch 4 adds bpf_refcount_acquire
  * Patches 5-7 add support for possibly-failing collection insert and remove
  * Patch 8 centralizes constructor-like functionality for local kptr types
  * Patch 9 adds tests for new functionality

base-commit: 4a1e885c

Changelog:

v1 -> v2: lore.kernel.org/bpf/20230410190753.2012798-1-davemarchevsky@fb.com

Patch #s used below refer to the patch's position in v1 unless otherwise
specified.

  * General
    * Rebase onto latest bpf-next (base-commit updated above)

  * Patch 4 - "bpf: Add bpf_refcount_acquire kfunc"
    * Fix typo in summary (Alexei)
  * Patch 7 - "Migrate bpf_rbtree_remove to possibly fail"
    * Modify a paragraph in patch summary to more clearly state that only
      bpf_rbtree_remove's non-owning ref clobbering behavior is changed by the
      patch (Alexei)
    * refcount_off == -1 -> refcount_off < 0  in "node type w/ both list
      and rb_node fields" check, since any negative value means "no
      bpf_refcount field found", and furthermore refcount_off is never
      explicitly set to -1, but rather -EINVAL. (Alexei)
    * Instead of just changing "btf: list_node and rb_node in same struct" test
      expectation to pass instead of fail, do some refactoring to test both
      "list_node, rb_node, and bpf_refcount" (success) and "list_node, rb_node,
      _no_ bpf_refcount" (failure) cases. This ensures that logic change in
      previous bullet point is correct.
      * v1's "btf: list_node and rb_node in same struct" test changes didn't
        add bpf_refcount, so the fact that btf load succeeded w/ list and
        rb_nodes but no bpf_refcount field is further proof that this logic
        was incorrect in v1.
  * Patch 8 - "bpf: Centralize btf_field-specific initialization logic"
    * Instead of doing __init_field_infer_size in kfuncs when taking
      bpf_list_head type input which might've been 0-initialized in map, go
      back to simple oneliner initialization. Add short comment explaining why
      this is necessary. (Alexei)
  * Patch 9 - "selftests/bpf: Add refcounted_kptr tests"
    * Don't __always_inline helper fns in progs/refcounted_kptr.c (Alexei)
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 4a1e885c 6147f151
+55 −25
@@ -187,6 +187,7 @@ enum btf_field_type {
 	BPF_RB_NODE    = (1 << 7),
 	BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD |
 				 BPF_RB_NODE | BPF_RB_ROOT,
+	BPF_REFCOUNT   = (1 << 8),
 };
 
 typedef void (*btf_dtor_kfunc_t)(void *);
@@ -210,6 +211,7 @@ struct btf_field_graph_root {
 
 struct btf_field {
 	u32 offset;
+	u32 size;
 	enum btf_field_type type;
 	union {
 		struct btf_field_kptr kptr;
@@ -222,15 +224,10 @@ struct btf_record {
 	u32 field_mask;
 	int spin_lock_off;
 	int timer_off;
+	int refcount_off;
 	struct btf_field fields[];
 };
 
-struct btf_field_offs {
-	u32 cnt;
-	u32 field_off[BTF_FIELDS_MAX];
-	u8 field_sz[BTF_FIELDS_MAX];
-};
-
 struct bpf_map {
 	/* The first two cachelines with read-mostly members of which some
 	 * are also accessed in fast-path (e.g. ops, max_entries).
@@ -257,7 +254,6 @@ struct bpf_map {
 	struct obj_cgroup *objcg;
 #endif
 	char name[BPF_OBJ_NAME_LEN];
-	struct btf_field_offs *field_offs;
 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
 	 */
@@ -299,6 +295,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
 		return "bpf_rb_root";
 	case BPF_RB_NODE:
 		return "bpf_rb_node";
+	case BPF_REFCOUNT:
+		return "bpf_refcount";
 	default:
 		WARN_ON_ONCE(1);
 		return "unknown";
@@ -323,6 +321,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
 		return sizeof(struct bpf_rb_root);
 	case BPF_RB_NODE:
 		return sizeof(struct bpf_rb_node);
+	case BPF_REFCOUNT:
+		return sizeof(struct bpf_refcount);
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
@@ -347,12 +347,42 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
 		return __alignof__(struct bpf_rb_root);
 	case BPF_RB_NODE:
 		return __alignof__(struct bpf_rb_node);
+	case BPF_REFCOUNT:
+		return __alignof__(struct bpf_refcount);
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
 	}
 }
 
+static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
+{
+	memset(addr, 0, field->size);
+
+	switch (field->type) {
+	case BPF_REFCOUNT:
+		refcount_set((refcount_t *)addr, 1);
+		break;
+	case BPF_RB_NODE:
+		RB_CLEAR_NODE((struct rb_node *)addr);
+		break;
+	case BPF_LIST_HEAD:
+	case BPF_LIST_NODE:
+		INIT_LIST_HEAD((struct list_head *)addr);
+		break;
+	case BPF_RB_ROOT:
+		/* RB_ROOT_CACHED 0-inits, no need to do anything after memset */
+	case BPF_SPIN_LOCK:
+	case BPF_TIMER:
+	case BPF_KPTR_UNREF:
+	case BPF_KPTR_REF:
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return;
+	}
+}
+
 static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_field_type type)
 {
 	if (IS_ERR_OR_NULL(rec))
@@ -360,14 +390,14 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f
 	return rec->field_mask & type;
 }
 
-static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj)
+static inline void bpf_obj_init(const struct btf_record *rec, void *obj)
 {
 	int i;
 
-	if (!foffs)
+	if (IS_ERR_OR_NULL(rec))
 		return;
-	for (i = 0; i < foffs->cnt; i++)
-		memset(obj + foffs->field_off[i], 0, foffs->field_sz[i]);
+	for (i = 0; i < rec->cnt; i++)
+		bpf_obj_init_field(&rec->fields[i], obj + rec->fields[i].offset);
 }
 
 /* 'dst' must be a temporary buffer and should not point to memory that is being
@@ -379,7 +409,7 @@ static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj)
  */
static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
 {
-	bpf_obj_init(map->field_offs, dst);
+	bpf_obj_init(map->record, dst);
 }
 
 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
@@ -399,14 +429,14 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
 }
 
 /* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */
-static inline void bpf_obj_memcpy(struct btf_field_offs *foffs,
+static inline void bpf_obj_memcpy(struct btf_record *rec,
 				  void *dst, void *src, u32 size,
 				  bool long_memcpy)
 {
 	u32 curr_off = 0;
 	int i;
 
-	if (likely(!foffs)) {
+	if (IS_ERR_OR_NULL(rec)) {
 		if (long_memcpy)
 			bpf_long_memcpy(dst, src, round_up(size, 8));
 		else
@@ -414,49 +444,49 @@ static inline void bpf_obj_memcpy(struct btf_field_offs *foffs,
 		return;
 	}
 
-	for (i = 0; i < foffs->cnt; i++) {
-		u32 next_off = foffs->field_off[i];
+	for (i = 0; i < rec->cnt; i++) {
+		u32 next_off = rec->fields[i].offset;
 		u32 sz = next_off - curr_off;
 
 		memcpy(dst + curr_off, src + curr_off, sz);
-		curr_off += foffs->field_sz[i] + sz;
+		curr_off += rec->fields[i].size + sz;
 	}
 	memcpy(dst + curr_off, src + curr_off, size - curr_off);
 }
 
 static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
 {
-	bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, false);
+	bpf_obj_memcpy(map->record, dst, src, map->value_size, false);
 }
 
 static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src)
 {
-	bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, true);
+	bpf_obj_memcpy(map->record, dst, src, map->value_size, true);
 }
 
-static inline void bpf_obj_memzero(struct btf_field_offs *foffs, void *dst, u32 size)
+static inline void bpf_obj_memzero(struct btf_record *rec, void *dst, u32 size)
 {
 	u32 curr_off = 0;
 	int i;
 
-	if (likely(!foffs)) {
+	if (IS_ERR_OR_NULL(rec)) {
 		memset(dst, 0, size);
 		return;
 	}
 
-	for (i = 0; i < foffs->cnt; i++) {
-		u32 next_off = foffs->field_off[i];
+	for (i = 0; i < rec->cnt; i++) {
+		u32 next_off = rec->fields[i].offset;
 		u32 sz = next_off - curr_off;
 
 		memset(dst + curr_off, 0, sz);
-		curr_off += foffs->field_sz[i] + sz;
+		curr_off += rec->fields[i].size + sz;
 	}
 	memset(dst + curr_off, 0, size - curr_off);
 }
 
 static inline void zero_map_value(struct bpf_map *map, void *dst)
 {
-	bpf_obj_memzero(map->field_offs, dst, map->value_size);
+	bpf_obj_memzero(map->record, dst, map->value_size);
 }
 
 void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
+6 −1
@@ -464,7 +464,12 @@ struct bpf_insn_aux_data {
 		 */
 		struct bpf_loop_inline_state loop_inline_state;
 	};
-	u64 obj_new_size; /* remember the size of type passed to bpf_obj_new to rewrite R1 */
+	union {
+		/* remember the size of type passed to bpf_obj_new to rewrite R1 */
+		u64 obj_new_size;
+		/* remember the offset of node field within type to rewrite */
+		u64 insert_off;
+	};
 	struct btf_struct_meta *kptr_struct_meta;
 	u64 map_key_state; /* constant (32 bit) key tracking for maps */
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
+0 −2
@@ -113,7 +113,6 @@ struct btf_id_dtor_kfunc {
 struct btf_struct_meta {
 	u32 btf_id;
 	struct btf_record *record;
-	struct btf_field_offs *field_offs;
 };
 
 struct btf_struct_metas {
@@ -207,7 +206,6 @@ int btf_find_timer(const struct btf *btf, const struct btf_type *t);
 struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
 				    u32 field_mask, u32 value_size);
 int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec);
-struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec);
 bool btf_type_is_void(const struct btf_type *t);
 s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind);
 const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
+4 −0
@@ -6985,6 +6985,10 @@ struct bpf_rb_node {
 	__u64 :64;
 } __attribute__((aligned(8)));
 
+struct bpf_refcount {
+	__u32 :32;
+} __attribute__((aligned(4)));
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
+34 −92
@@ -1666,10 +1666,8 @@ static void btf_struct_metas_free(struct btf_struct_metas *tab)
 
 	if (!tab)
 		return;
-	for (i = 0; i < tab->cnt; i++) {
+	for (i = 0; i < tab->cnt; i++)
 		btf_record_free(tab->types[i].record);
-		kfree(tab->types[i].field_offs);
-	}
 	kfree(tab);
 }
 
@@ -3393,6 +3391,7 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
 	field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
 	field_mask_test_name(BPF_RB_ROOT,   "bpf_rb_root");
 	field_mask_test_name(BPF_RB_NODE,   "bpf_rb_node");
+	field_mask_test_name(BPF_REFCOUNT,  "bpf_refcount");
 
 	/* Only return BPF_KPTR when all other types with matchable names fail */
 	if (field_mask & BPF_KPTR) {
@@ -3441,6 +3440,7 @@ static int btf_find_struct_field(const struct btf *btf,
 		case BPF_TIMER:
 		case BPF_LIST_NODE:
 		case BPF_RB_NODE:
+		case BPF_REFCOUNT:
 			ret = btf_find_struct(btf, member_type, off, sz, field_type,
 					      idx < info_cnt ? &info[idx] : &tmp);
 			if (ret < 0)
@@ -3506,6 +3506,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
 		case BPF_TIMER:
 		case BPF_LIST_NODE:
 		case BPF_RB_NODE:
+		case BPF_REFCOUNT:
 			ret = btf_find_struct(btf, var_type, off, sz, field_type,
 					      idx < info_cnt ? &info[idx] : &tmp);
 			if (ret < 0)
@@ -3700,12 +3701,24 @@ static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field,
 					    __alignof__(struct bpf_rb_node));
 }
 
+static int btf_field_cmp(const void *_a, const void *_b, const void *priv)
+{
+	const struct btf_field *a = (const struct btf_field *)_a;
+	const struct btf_field *b = (const struct btf_field *)_b;
+
+	if (a->offset < b->offset)
+		return -1;
+	else if (a->offset > b->offset)
+		return 1;
+	return 0;
+}
+
 struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
 				    u32 field_mask, u32 value_size)
 {
 	struct btf_field_info info_arr[BTF_FIELDS_MAX];
+	u32 next_off = 0, field_type_size;
 	struct btf_record *rec;
-	u32 next_off = 0;
 	int ret, i, cnt;
 
 	ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr));
@@ -3724,8 +3737,10 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 
 	rec->spin_lock_off = -EINVAL;
 	rec->timer_off = -EINVAL;
+	rec->refcount_off = -EINVAL;
 	for (i = 0; i < cnt; i++) {
-		if (info_arr[i].off + btf_field_type_size(info_arr[i].type) > value_size) {
+		field_type_size = btf_field_type_size(info_arr[i].type);
+		if (info_arr[i].off + field_type_size > value_size) {
 			WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size);
 			ret = -EFAULT;
 			goto end;
@@ -3734,11 +3749,12 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 			ret = -EEXIST;
 			goto end;
 		}
-		next_off = info_arr[i].off + btf_field_type_size(info_arr[i].type);
+		next_off = info_arr[i].off + field_type_size;
 
 		rec->field_mask |= info_arr[i].type;
 		rec->fields[i].offset = info_arr[i].off;
 		rec->fields[i].type = info_arr[i].type;
+		rec->fields[i].size = field_type_size;
 
 		switch (info_arr[i].type) {
 		case BPF_SPIN_LOCK:
@@ -3751,6 +3767,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 			/* Cache offset for faster lookup at runtime */
 			rec->timer_off = rec->fields[i].offset;
 			break;
+		case BPF_REFCOUNT:
+			WARN_ON_ONCE(rec->refcount_off >= 0);
+			/* Cache offset for faster lookup at runtime */
+			rec->refcount_off = rec->fields[i].offset;
+			break;
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
 			ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
@@ -3784,30 +3805,16 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 		goto end;
 	}
 
-	/* need collection identity for non-owning refs before allowing this
-	 *
-	 * Consider a node type w/ both list and rb_node fields:
-	 *   struct node {
-	 *     struct bpf_list_node l;
-	 *     struct bpf_rb_node r;
-	 *   }
-	 *
-	 * Used like so:
-	 *   struct node *n = bpf_obj_new(....);
-	 *   bpf_list_push_front(&list_head, &n->l);
-	 *   bpf_rbtree_remove(&rb_root, &n->r);
-	 *
-	 * It should not be possible to rbtree_remove the node since it hasn't
-	 * been added to a tree. But push_front converts n to a non-owning
-	 * reference, and rbtree_remove accepts the non-owning reference to
-	 * a type w/ bpf_rb_node field.
-	 */
-	if (btf_record_has_field(rec, BPF_LIST_NODE) &&
+	if (rec->refcount_off < 0 &&
+	    btf_record_has_field(rec, BPF_LIST_NODE) &&
 	    btf_record_has_field(rec, BPF_RB_NODE)) {
 		ret = -EINVAL;
 		goto end;
 	}
 
+	sort_r(rec->fields, rec->cnt, sizeof(struct btf_field), btf_field_cmp,
+	       NULL, rec);
+
 	return rec;
 end:
 	btf_record_free(rec);
@@ -3889,61 +3896,6 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
 	return 0;
 }
 
-static int btf_field_offs_cmp(const void *_a, const void *_b, const void *priv)
-{
-	const u32 a = *(const u32 *)_a;
-	const u32 b = *(const u32 *)_b;
-
-	if (a < b)
-		return -1;
-	else if (a > b)
-		return 1;
-	return 0;
-}
-
-static void btf_field_offs_swap(void *_a, void *_b, int size, const void *priv)
-{
-	struct btf_field_offs *foffs = (void *)priv;
-	u32 *off_base = foffs->field_off;
-	u32 *a = _a, *b = _b;
-	u8 *sz_a, *sz_b;
-
-	sz_a = foffs->field_sz + (a - off_base);
-	sz_b = foffs->field_sz + (b - off_base);
-
-	swap(*a, *b);
-	swap(*sz_a, *sz_b);
-}
-
-struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec)
-{
-	struct btf_field_offs *foffs;
-	u32 i, *off;
-	u8 *sz;
-
-	BUILD_BUG_ON(ARRAY_SIZE(foffs->field_off) != ARRAY_SIZE(foffs->field_sz));
-	if (IS_ERR_OR_NULL(rec))
-		return NULL;
-
-	foffs = kzalloc(sizeof(*foffs), GFP_KERNEL | __GFP_NOWARN);
-	if (!foffs)
-		return ERR_PTR(-ENOMEM);
-
-	off = foffs->field_off;
-	sz = foffs->field_sz;
-	for (i = 0; i < rec->cnt; i++) {
-		off[i] = rec->fields[i].offset;
-		sz[i] = btf_field_type_size(rec->fields[i].type);
-	}
-	foffs->cnt = rec->cnt;
-
-	if (foffs->cnt == 1)
-		return foffs;
-	sort_r(foffs->field_off, foffs->cnt, sizeof(foffs->field_off[0]),
-	       btf_field_offs_cmp, btf_field_offs_swap, foffs);
-	return foffs;
-}
-
 static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
 			      u32 type_id, void *data, u8 bits_offset,
 			      struct btf_show *show)
@@ -5348,6 +5300,7 @@ static const char *alloc_obj_fields[] = {
 	"bpf_list_node",
 	"bpf_rb_root",
 	"bpf_rb_node",
+	"bpf_refcount",
 };
 
 static struct btf_struct_metas *
@@ -5386,7 +5339,6 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
 	for (i = 1; i < n; i++) {
 		struct btf_struct_metas *new_tab;
 		const struct btf_member *member;
-		struct btf_field_offs *foffs;
 		struct btf_struct_meta *type;
 		struct btf_record *record;
 		const struct btf_type *t;
@@ -5422,23 +5374,13 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
 		type = &tab->types[tab->cnt];
 		type->btf_id = i;
 		record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
-						  BPF_RB_ROOT | BPF_RB_NODE, t->size);
+						  BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
 		/* The record cannot be unset, treat it as an error if so */
 		if (IS_ERR_OR_NULL(record)) {
 			ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
 			goto free;
 		}
-		foffs = btf_parse_field_offs(record);
-		/* We need the field_offs to be valid for a valid record,
-		 * either both should be set or both should be unset.
-		 */
-		if (IS_ERR_OR_NULL(foffs)) {
-			btf_record_free(record);
-			ret = -EFAULT;
-			goto free;
-		}
 		type->record = record;
-		type->field_offs = foffs;
 		tab->cnt++;
 	}
 	return tab;