Merge branch 'Add support for kptrs in more BPF maps' (6c18e375) · Commits · EulixOS / Software / Kernel

include/linux/bpf_local_storage.h

+6 −0

Original line number	Diff line number	Diff line
		@@ -74,6 +74,12 @@ struct bpf_local_storage_elem {
		struct hlist_node snode; /* Linked to bpf_local_storage */
		struct bpf_local_storage __rcu *local_storage;
		struct rcu_head rcu;
		bool can_use_smap; /* Is it safe to access smap in bpf_selem_free_* RCU
		* callbacks? bpf_local_storage_map_free only
		* executes rcu_barrier when there are special
		* fields, this field remembers that to ensure we
		* don't access already freed smap in sdata.
		*/
		/* 8 bytes hole */
		/* The data is stored in another cacheline to minimize
		* the number of cachelines access during a cache hit.

kernel/bpf/bpf_local_storage.c

+44 −4

Original line number	Diff line number	Diff line
		@@ -85,6 +85,7 @@ bpf_selem_alloc(struct bpf_local_storage_map smap, void owner,
		if (selem) {
		if (value)
		copy_map_value(&smap->map, SDATA(selem)->data, value);
		/* No need to call check_and_init_map_value as memory is zero init */
		return selem;
		}

		@@ -113,10 +114,25 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
		struct bpf_local_storage_elem *selem;

		selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
		if (rcu_trace_implies_rcu_gp())
		/* The can_use_smap bool is set whenever we need to free additional
		* fields in selem data before freeing selem. bpf_local_storage_map_free
		* only executes rcu_barrier to wait for RCU callbacks when it has
		* special fields, hence we can only conditionally dereference smap, as
		* by this time the map might have already been freed without waiting
		* for our call_rcu callback if it did not have any special fields.
		*/
		if (selem->can_use_smap)
		bpf_obj_free_fields(SDATA(selem)->smap->map.record, SDATA(selem)->data);
		kfree(selem);
		}

		static void bpf_selem_free_tasks_trace_rcu(struct rcu_head *rcu)
		{
		/* Free directly if Tasks Trace RCU GP also implies RCU GP */
		if (rcu_trace_implies_rcu_gp())
		bpf_selem_free_rcu(rcu);
		else
		kfree_rcu(selem, rcu);
		call_rcu(rcu, bpf_selem_free_rcu);
		}

		/* local_storage->lock must be held and selem->local_storage == local_storage.
		@@ -170,9 +186,9 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);

		if (use_trace_rcu)
		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_tasks_trace_rcu);
		else
		kfree_rcu(selem, rcu);
		call_rcu(&selem->rcu, bpf_selem_free_rcu);

		return free_local_storage;
		}
		@@ -240,6 +256,11 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
		RCU_INIT_POINTER(SDATA(selem)->smap, smap);
		hlist_add_head_rcu(&selem->map_node, &b->list);
		raw_spin_unlock_irqrestore(&b->lock, flags);

		/* If our data will have special fields, smap will wait for us to use
		* its record in bpf_selem_free_* RCU callbacks before freeing itself.
		*/
		selem->can_use_smap = !IS_ERR_OR_NULL(smap->map.record);
		}

		void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu)
		@@ -723,6 +744,25 @@ void bpf_local_storage_map_free(struct bpf_map *map,
		*/
		synchronize_rcu();

		/* Only delay freeing of smap, buckets are not needed anymore */
		kvfree(smap->buckets);

		/* When local storage has special fields, callbacks for
		* bpf_selem_free_rcu and bpf_selem_free_tasks_trace_rcu will keep using
		* the map BTF record, we need to execute an RCU barrier to wait for
		* them as the record will be freed right after our map_free callback.
		*/
		if (!IS_ERR_OR_NULL(smap->map.record)) {
		rcu_barrier_tasks_trace();
		/* We cannot skip rcu_barrier() when rcu_trace_implies_rcu_gp()
		* is true, because while call_rcu invocation is skipped in that
		* case in bpf_selem_free_tasks_trace_rcu (and all local storage
		* maps pass use_trace_rcu = true), there can be call_rcu
		* callbacks based on use_trace_rcu = false in the earlier while
		* ((selem = ...)) loop or from bpf_local_storage_unlink_nolock
		* called from owner's free path.
		*/
		rcu_barrier();
		}
		bpf_map_area_free(smap);
		}

kernel/bpf/hashtab.c

+37 −22

Original line number	Diff line number	Diff line
		@@ -249,9 +249,20 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab)
		struct htab_elem *elem;

		elem = get_htab_elem(htab, i);
		if (htab_is_percpu(htab)) {
		void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
		int cpu;

		for_each_possible_cpu(cpu) {
		bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
		cond_resched();
		}
		} else {
		bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
		cond_resched();
		}
		cond_resched();
		}
		}

		static void htab_free_elems(struct bpf_htab *htab)
		@@ -759,10 +770,18 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
		static void check_and_free_fields(struct bpf_htab *htab,
		struct htab_elem *elem)
		{
		if (htab_is_percpu(htab)) {
		void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
		int cpu;

		for_each_possible_cpu(cpu)
		bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
		} else {
		void *map_value = elem->key + round_up(htab->map.key_size, 8);

		bpf_obj_free_fields(htab->map.record, map_value);
		}
		}

		/* It is called from the bpf_lru_list when the LRU needs to delete
		* older elements from the htab.
		@@ -858,9 +877,9 @@ static int htab_map_get_next_key(struct bpf_map map, void key, void *next_key)

		static void htab_elem_free(struct bpf_htab htab, struct htab_elem l)
		{
		check_and_free_fields(htab, l);
		if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
		bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
		check_and_free_fields(htab, l);
		bpf_mem_cache_free(&htab->ma, l);
		}

		@@ -918,14 +937,13 @@ static void pcpu_copy_value(struct bpf_htab htab, void __percpu pptr,
		{
		if (!onallcpus) {
		/* copy true value_size bytes */
		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
		copy_map_value(&htab->map, this_cpu_ptr(pptr), value);
		} else {
		u32 size = round_up(htab->map.value_size, 8);
		int off = 0, cpu;

		for_each_possible_cpu(cpu) {
		bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
		value + off, size);
		copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off);
		off += size;
		}
		}
		@@ -940,16 +958,14 @@ static void pcpu_init_value(struct bpf_htab htab, void __percpu pptr,
		* (onallcpus=false always when coming from bpf prog).
		*/
		if (!onallcpus) {
		u32 size = round_up(htab->map.value_size, 8);
		int current_cpu = raw_smp_processor_id();
		int cpu;

		for_each_possible_cpu(cpu) {
		if (cpu == current_cpu)
		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value,
		size);
		else
		memset(per_cpu_ptr(pptr, cpu), 0, size);
		copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
		else /* Since elem is preallocated, we cannot touch special fields */
		zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
		}
		} else {
		pcpu_copy_value(htab, pptr, value, onallcpus);
		@@ -1575,9 +1591,8 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map map, void key,

		pptr = htab_elem_get_ptr(l, key_size);
		for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off,
		per_cpu_ptr(pptr, cpu),
		roundup_value_size);
		copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
		check_and_init_map_value(&htab->map, value + off);
		off += roundup_value_size;
		}
		} else {
		@@ -1772,8 +1787,8 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,

		pptr = htab_elem_get_ptr(l, map->key_size);
		for_each_possible_cpu(cpu) {
		bpf_long_memcpy(dst_val + off,
		per_cpu_ptr(pptr, cpu), size);
		copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
		check_and_init_map_value(&htab->map, dst_val + off);
		off += size;
		}
		} else {
		@@ -2046,9 +2061,9 @@ static int __bpf_hash_map_seq_show(struct seq_file seq, struct htab_elem elem)
		roundup_value_size = round_up(map->value_size, 8);
		pptr = htab_elem_get_ptr(elem, map->key_size);
		for_each_possible_cpu(cpu) {
		bpf_long_memcpy(info->percpu_value_buf + off,
		per_cpu_ptr(pptr, cpu),
		roundup_value_size);
		copy_map_value_long(map, info->percpu_value_buf + off,
		per_cpu_ptr(pptr, cpu));
		check_and_init_map_value(map, info->percpu_value_buf + off);
		off += roundup_value_size;
		}
		ctx.value = info->percpu_value_buf;
		@@ -2292,8 +2307,8 @@ int bpf_percpu_hash_copy(struct bpf_map map, void key, void *value)
		*/
		pptr = htab_elem_get_ptr(l, map->key_size);
		for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off,
		per_cpu_ptr(pptr, cpu), size);
		copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
		check_and_init_map_value(map, value + off);
		off += size;
		}
		ret = 0;

kernel/bpf/syscall.c

+7 −1

Original line number	Diff line number	Diff line
		@@ -1059,9 +1059,15 @@ static int map_check_btf(struct bpf_map map, const struct btf btf,
		case BPF_KPTR_UNREF:
		case BPF_KPTR_REF:
		if (map->map_type != BPF_MAP_TYPE_HASH &&
		map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
		map->map_type != BPF_MAP_TYPE_LRU_HASH &&
		map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
		map->map_type != BPF_MAP_TYPE_ARRAY &&
		map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) {
		map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
		map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
		map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
		map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
		map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
		ret = -EOPNOTSUPP;
		goto free_map_tab;
		}

kernel/bpf/verifier.c

+8 −4

Original line number	Diff line number	Diff line
		@@ -7222,22 +7222,26 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
		break;
		case BPF_MAP_TYPE_SK_STORAGE:
		if (func_id != BPF_FUNC_sk_storage_get &&
		func_id != BPF_FUNC_sk_storage_delete)
		func_id != BPF_FUNC_sk_storage_delete &&
		func_id != BPF_FUNC_kptr_xchg)
		goto error;
		break;
		case BPF_MAP_TYPE_INODE_STORAGE:
		if (func_id != BPF_FUNC_inode_storage_get &&
		func_id != BPF_FUNC_inode_storage_delete)
		func_id != BPF_FUNC_inode_storage_delete &&
		func_id != BPF_FUNC_kptr_xchg)
		goto error;
		break;
		case BPF_MAP_TYPE_TASK_STORAGE:
		if (func_id != BPF_FUNC_task_storage_get &&
		func_id != BPF_FUNC_task_storage_delete)
		func_id != BPF_FUNC_task_storage_delete &&
		func_id != BPF_FUNC_kptr_xchg)
		goto error;
		break;
		case BPF_MAP_TYPE_CGRP_STORAGE:
		if (func_id != BPF_FUNC_cgrp_storage_get &&
		func_id != BPF_FUNC_cgrp_storage_delete)
		func_id != BPF_FUNC_cgrp_storage_delete &&
		func_id != BPF_FUNC_kptr_xchg)
		goto error;
		break;
		case BPF_MAP_TYPE_BLOOM_FILTER: