Commit 3538a0fb authored by Alexei Starovoitov
Browse files

Merge branch 'Use __GFP_ZERO in bpf memory allocator'

Hou Tao says:

====================

From: Hou Tao <houtao1@huawei.com>

Hi,

The patchset tries to fix the hard-lockup problem found when checking how htab
handles element reuse in the bpf memory allocator. The immediate reuse of
freed elements will reinitialize special fields (e.g., bpf_spin_lock) in
the htab map value, and it may corrupt a lookup procedure with the BPF_F_LOCK
flag, which acquires the bpf-spin-lock during value copying, and lead to a
hard-lockup as shown in patch #2. Patch #1 fixes it by using __GFP_ZERO when
allocating the object from slab, and the behavior is similar to the preallocated
hash-table case. Please see individual patches for more details. And comments
are always welcome.

Regards,

Change Log:
v1:
  * Use __GFP_ZERO instead of ctor to avoid retpoline overhead (from Alexei)
  * Add comments for check_and_init_map_value() (from Alexei)
  * split __GFP_ZERO patches out of the original patchset to unblock
    the development work of others.

RFC: https://lore.kernel.org/bpf/20221230041151.1231169-1-houtao@huaweicloud.com


====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents b2d9002e f88da2d4
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -363,6 +363,13 @@ static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj)
		memset(obj + foffs->field_off[i], 0, foffs->field_sz[i]);
}

/* 'dst' must be a temporary buffer and should not point to memory that is being
 * used in parallel by a bpf program or bpf syscall, otherwise the access from
 * the bpf program or bpf syscall may be corrupted by the reinitialization,
 * leading to weird problems. Even if 'dst' is newly allocated from the bpf
 * memory allocator, it is still possible for 'dst' to be used in parallel by a
 * bpf program or bpf syscall.
 */
static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
{
	bpf_obj_init(map->field_offs, dst);
+2 −2
Original line number Diff line number Diff line
@@ -1004,8 +1004,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
			l_new = ERR_PTR(-ENOMEM);
			goto dec_count;
		}
		check_and_init_map_value(&htab->map,
					 l_new->key + round_up(key_size, 8));
	}

	memcpy(l_new->key, key, key_size);
@@ -1592,6 +1590,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
			else
				copy_map_value(map, value, l->key +
					       roundup_key_size);
			/* Zeroing special fields in the temp buffer */
			check_and_init_map_value(map, value);
		}

@@ -1792,6 +1791,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
						      true);
			else
				copy_map_value(map, dst_val, value);
			/* Zeroing special fields in the temp buffer */
			check_and_init_map_value(map, dst_val);
		}
		if (do_delete) {
+1 −1
Original line number Diff line number Diff line
@@ -143,7 +143,7 @@ static void *__alloc(struct bpf_mem_cache *c, int node)
		return obj;
	}

	return kmalloc_node(c->unit_size, flags, node);
	return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
}

static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
+101 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
#define _GNU_SOURCE
#include <sched.h>
#include <stdbool.h>
#include <test_progs.h>
#include "htab_reuse.skel.h"

/* State handed to every worker thread started by test_htab_reuse(). A single
 * instance is shared by all workers.
 */
struct htab_op_ctx {
	/* File descriptor of the hash map the workers operate on. */
	int fd;
	/* Upper bound on the number of iterations each worker performs. */
	int loop;
	/* Set to true by the test when thread creation fails, so that the
	 * workers already running leave their loops early.
	 */
	bool stop;
};

/* User-space view of the map value. The BPF program declares the same value
 * with 'lock' as a struct bpf_spin_lock; here it is a plain integer that the
 * writer always passes as 0.
 */
struct htab_val {
	/* Placeholder occupying the position of the BPF-side spin lock. */
	unsigned int lock;
	/* Payload; the writer stores the element's key here. */
	unsigned int data;
};

static void *htab_lookup_fn(void *arg)
{
	struct htab_op_ctx *ctx = arg;
	int i = 0;

	while (i++ < ctx->loop && !ctx->stop) {
		struct htab_val value;
		unsigned int key;

		/* Use BPF_F_LOCK to use spin-lock in map value. */
		key = 7;
		bpf_map_lookup_elem_flags(ctx->fd, &key, &value, BPF_F_LOCK);
	}

	return NULL;
}

static void *htab_update_fn(void *arg)
{
	struct htab_op_ctx *ctx = arg;
	int i = 0;

	while (i++ < ctx->loop && !ctx->stop) {
		struct htab_val value;
		unsigned int key;

		key = 7;
		value.lock = 0;
		value.data = key;
		bpf_map_update_elem(ctx->fd, &key, &value, BPF_F_LOCK);
		bpf_map_delete_elem(ctx->fd, &key);

		key = 24;
		value.lock = 0;
		value.data = key;
		bpf_map_update_elem(ctx->fd, &key, &value, BPF_F_LOCK);
		bpf_map_delete_elem(ctx->fd, &key);
	}

	return NULL;
}

/* Selftest entry point.
 *
 * Loads the htab_reuse skeleton, then runs one writer thread
 * (htab_update_fn) and four reader threads (htab_lookup_fn) against the
 * skeleton's 'htab' map, each for at most 500 iterations. The test has no
 * assertion on map contents; it only asserts that the skeleton loads and
 * the threads can be created, then waits for all of them to finish.
 */
void test_htab_reuse(void)
{
	unsigned int idx, writers = 1, readers = 4;
	pthread_t tids[writers + readers];
	struct htab_op_ctx ctx;
	struct htab_reuse *skel;
	int err;

	skel = htab_reuse__open_and_load();
	if (!ASSERT_OK_PTR(skel, "htab_reuse__open_and_load"))
		return;

	ctx.fd = bpf_map__fd(skel->maps.htab);
	ctx.loop = 500;
	ctx.stop = false;

	/* Zeroed slots mark threads that were never started, so the join
	 * loop below can skip them.
	 */
	memset(tids, 0, sizeof(tids));

	/* The first 'writers' slots run the updater, the rest run lookups. */
	for (idx = 0; idx < writers + readers; idx++) {
		void *(*worker)(void *);

		if (idx < writers)
			worker = htab_update_fn;
		else
			worker = htab_lookup_fn;

		err = pthread_create(&tids[idx], NULL, worker, &ctx);
		if (!ASSERT_OK(err, "pthread_create")) {
			/* Ask the threads already running to wind down. */
			ctx.stop = true;
			break;
		}
	}

	for (idx = 0; idx < writers + readers; idx++) {
		if (tids[idx])
			pthread_join(tids[idx], NULL);
	}
	htab_reuse__destroy(skel);
}
+19 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* License string placed in the object's "license" section. */
char _license[] SEC("license") = "GPL";

/* Value type stored in the 'htab' map: a BPF spin lock followed by a data
 * word.
 */
struct htab_val {
	/* Special field embedded in the map value. */
	struct bpf_spin_lock lock;
	/* Plain payload word. */
	unsigned int data;
};

/* Hash map used by the htab_reuse test: up to 64 entries, keyed by an
 * unsigned int, holding struct htab_val values.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 64);
	__type(key, unsigned int);
	__type(value, struct htab_val);
	/* Request that elements not be preallocated at map creation. */
	__uint(map_flags, BPF_F_NO_PREALLOC);
} htab SEC(".maps");