Commit 23d86c8e authored by Martin KaFai Lau

Merge branch 'Use this_cpu_xxx for preemption-safety'

Hou Tao says:

====================

From: Hou Tao <houtao1@huawei.com>

Hi,

The patchset aims to make the updates of the per-cpu prog->active and
the per-cpu bpf_task_storage_busy preemption-safe. The problem is that
on some architectures (e.g. arm64), __this_cpu_{inc|dec|inc_return}
are neither preemption-safe nor IRQ-safe, so under a fully preemptible
kernel the concurrent updates on these per-cpu variables may be
interleaved and the final values of these variables may not be zero.
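
For illustration, here is a minimal userspace analogue (an editorial
sketch, not kernel code): the plain ++/-- on a shared counter stands in
for __this_cpu_{inc|dec}, whose load/add/store sequence can be
interleaved by preemption, while __atomic_fetch_{add|sub} plays the
role of the preemption/IRQ-safe this_cpu_{inc|dec}. The file name and
all identifiers below are illustrative.

/* Userspace sketch of the torn read-modify-write; build with:
 * gcc -O2 -pthread race_demo.c
 */
#include <pthread.h>
#include <stdio.h>

#define NR_THREADS	4
#define NR_LOOPS	1000000

static volatile int plain_cnt;	/* like __this_cpu_{inc|dec}: torn RMW */
static int atomic_cnt;		/* like this_cpu_{inc|dec}: atomic RMW */

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < NR_LOOPS; i++) {
		plain_cnt++;	/* load, add, store: can interleave */
		plain_cnt--;
		__atomic_fetch_add(&atomic_cnt, 1, __ATOMIC_RELAXED);
		__atomic_fetch_sub(&atomic_cnt, 1, __ATOMIC_RELAXED);
	}
	return NULL;
}

int main(void)
{
	pthread_t tids[NR_THREADS];

	for (int i = 0; i < NR_THREADS; i++)
		pthread_create(&tids[i], NULL, worker, NULL);
	for (int i = 0; i < NR_THREADS; i++)
		pthread_join(tids[i], NULL);

	/* Balanced inc/dec should leave both counters at zero; the
	 * plain counter typically is not zero under contention.
	 */
	printf("plain=%d atomic=%d\n", plain_cnt, atomic_cnt);
	return 0;
}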

Patches 1 & 2 use the preemption-safe per-cpu helpers to manipulate
prog->active and bpf_task_storage_busy. Patches 3 & 4 add a test case
to map_tests to show that concurrent updates on the per-cpu
bpf_task_storage_busy through __this_cpu_{inc|dec} are not atomic.

Comments are always welcome.

Regards,
Tao

Change Log:
v2:
* Patch 1: update the commit message to indicate the problem is only
  possible on a fully preemptible kernel
* Patch 2: a new patch which fixes the problem for prog->active
* Patch 3 & 4: move the test to test_maps and make it depend on
  CONFIG_PREEMPT

v1: https://lore.kernel.org/bpf/20220829142752.330094-1-houtao@huaweicloud.com/


====================

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
parents c9ae8c96 73b97bc7
kernel/bpf/bpf_local_storage.c  +2 −2
@@ -555,11 +555,11 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
				struct bpf_local_storage_elem, map_node))) {
			if (busy_counter) {
				migrate_disable();
-				__this_cpu_inc(*busy_counter);
+				this_cpu_inc(*busy_counter);
			}
			bpf_selem_unlink(selem, false);
			if (busy_counter) {
-				__this_cpu_dec(*busy_counter);
+				this_cpu_dec(*busy_counter);
				migrate_enable();
			}
			cond_resched_rcu();
kernel/bpf/bpf_task_storage.c  +4 −4
@@ -26,20 +26,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy);
static void bpf_task_storage_lock(void)
{
	migrate_disable();
-	__this_cpu_inc(bpf_task_storage_busy);
+	this_cpu_inc(bpf_task_storage_busy);
}

static void bpf_task_storage_unlock(void)
{
-	__this_cpu_dec(bpf_task_storage_busy);
+	this_cpu_dec(bpf_task_storage_busy);
	migrate_enable();
}

static bool bpf_task_storage_trylock(void)
{
	migrate_disable();
-	if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
-		__this_cpu_dec(bpf_task_storage_busy);
+	if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
+		this_cpu_dec(bpf_task_storage_busy);
		migrate_enable();
		return false;
	}
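
The trylock above is exactly where a non-atomic inc-and-test breaks: a
preempting task can slip in between the increment and the comparison.
A hedged userspace sketch of the same recursion guard follows;
storage_trylock/storage_unlock and the plain `busy` variable are
illustrative stand-ins, not kernel API.

#include <stdbool.h>

static int busy;	/* stands in for one CPU's bpf_task_storage_busy */

static bool storage_trylock(void)
{
	/* Inc-and-read must be one atomic step, mirroring
	 * this_cpu_inc_return(): the first owner sees 0 -> 1,
	 * any nested attempt sees != 1 and backs off.
	 */
	if (__atomic_add_fetch(&busy, 1, __ATOMIC_RELAXED) != 1) {
		__atomic_sub_fetch(&busy, 1, __ATOMIC_RELAXED);
		return false;
	}
	return true;
}

static void storage_unlock(void)
{
	__atomic_sub_fetch(&busy, 1, __ATOMIC_RELAXED);
}

int main(void)
{
	bool outer = storage_trylock();		/* 0 -> 1: succeeds */
	bool nested = storage_trylock();	/* sees 2: fails */

	if (outer)
		storage_unlock();
	return !(outer && !nested);
}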
kernel/bpf/trampoline.c  +4 −4
@@ -895,7 +895,7 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *ru

	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);

-	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
+	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
		inc_misses_counter(prog);
		return 0;
	}
@@ -930,7 +930,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_
	bpf_reset_run_ctx(run_ctx->saved_run_ctx);

	update_prog_stats(prog, start);
-	__this_cpu_dec(*(prog->active));
+	this_cpu_dec(*(prog->active));
	migrate_enable();
	rcu_read_unlock();
}
@@ -966,7 +966,7 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_r
	migrate_disable();
	might_fault();

-	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
+	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
		inc_misses_counter(prog);
		return 0;
	}
@@ -982,7 +982,7 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
	bpf_reset_run_ctx(run_ctx->saved_run_ctx);

	update_prog_stats(prog, start);
-	__this_cpu_dec(*(prog->active));
+	this_cpu_dec(*(prog->active));
	migrate_enable();
	rcu_read_unlock_trace();
}
tools/testing/selftests/bpf/map_tests/task_storage_map.c  +122 −0 (new file)
// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2022. Huawei Technologies Co., Ltd */
#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdbool.h>
#include <errno.h>
#include <string.h>
#include <pthread.h>

#include <bpf/bpf.h>
#include <bpf/libbpf.h>

#include "test_maps.h"
#include "task_local_storage_helpers.h"
#include "read_bpf_task_storage_busy.skel.h"

struct lookup_ctx {
	bool start;
	bool stop;
	int pid_fd;
	int map_fd;
	int loop;
};

/* Lookup threads spin until the main thread flips ctx->start, then
 * hammer bpf_map_lookup_elem() so that bpf_task_storage_trylock()
 * runs concurrently on the pinned CPU.
 */
static void *lookup_fn(void *arg)
{
	struct lookup_ctx *ctx = arg;
	long value;
	int i = 0;

	while (!ctx->start)
		usleep(1);

	while (!ctx->stop && i++ < ctx->loop)
		bpf_map_lookup_elem(ctx->map_fd, &ctx->pid_fd, &value);
	return NULL;
}

static void abort_lookup(struct lookup_ctx *ctx, pthread_t *tids, unsigned int nr)
{
	unsigned int i;

	ctx->stop = true;
	ctx->start = true;
	for (i = 0; i < nr; i++)
		pthread_join(tids[i], NULL);
}

void test_task_storage_map_stress_lookup(void)
{
#define MAX_NR_THREAD 4096
	unsigned int i, nr = 256, loop = 8192, cpu = 0;
	struct read_bpf_task_storage_busy *skel;
	pthread_t tids[MAX_NR_THREAD];
	struct lookup_ctx ctx;
	cpu_set_t old, new;
	const char *cfg;
	int err;

	cfg = getenv("TASK_STORAGE_MAP_NR_THREAD");
	if (cfg) {
		nr = atoi(cfg);
		if (nr > MAX_NR_THREAD)
			nr = MAX_NR_THREAD;
	}
	cfg = getenv("TASK_STORAGE_MAP_NR_LOOP");
	if (cfg)
		loop = atoi(cfg);
	cfg = getenv("TASK_STORAGE_MAP_PIN_CPU");
	if (cfg)
		cpu = atoi(cfg);

	skel = read_bpf_task_storage_busy__open_and_load();
	err = libbpf_get_error(skel);
	CHECK(err, "open_and_load", "error %d\n", err);

	/* Only for a fully preemptible kernel; otherwise free the
	 * skeleton and skip.
	 */
	if (!skel->kconfig->CONFIG_PREEMPT) {
		read_bpf_task_storage_busy__destroy(skel);
		return;
	}

	/* Save the old affinity setting */
	sched_getaffinity(getpid(), sizeof(old), &old);

	/* Pinned on a specific CPU */
	CPU_ZERO(&new);
	CPU_SET(cpu, &new);
	sched_setaffinity(getpid(), sizeof(new), &new);

	ctx.start = false;
	ctx.stop = false;
	ctx.pid_fd = sys_pidfd_open(getpid(), 0);
	ctx.map_fd = bpf_map__fd(skel->maps.task);
	ctx.loop = loop;
	for (i = 0; i < nr; i++) {
		err = pthread_create(&tids[i], NULL, lookup_fn, &ctx);
		if (err) {
			abort_lookup(&ctx, tids, i);
			CHECK(err, "pthread_create", "error %d\n", err);
			goto out;
		}
	}

	ctx.start = true;
	for (i = 0; i < nr; i++)
		pthread_join(tids[i], NULL);

	skel->bss->pid = getpid();
	err = read_bpf_task_storage_busy__attach(skel);
	CHECK(err, "attach", "error %d\n", err);

	/* Trigger program */
	syscall(SYS_gettid);
	skel->bss->pid = 0;

	CHECK(skel->bss->busy != 0, "bad bpf_task_storage_busy", "got %d\n", skel->bss->busy);
out:
	read_bpf_task_storage_busy__destroy(skel);
	/* Restore affinity setting */
	sched_setaffinity(getpid(), sizeof(old), &old);
}
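
A note on running it (assuming the usual tools/testing/selftests/bpf
layout): the case is linked into the test_maps binary, and the knobs
above are plain environment variables, e.g.
TASK_STORAGE_MAP_NR_THREAD=1024 TASK_STORAGE_MAP_PIN_CPU=1 ./test_maps.
On a kernel built without CONFIG_PREEMPT the test exits early, since
the interleaving only occurs under full preemption.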
tools/testing/selftests/bpf/prog_tests/bprm_opts.c  +1 −9
@@ -9,18 +9,10 @@

#include "bprm_opts.skel.h"
#include "network_helpers.h"

#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434
#endif
#include "task_local_storage_helpers.h"

static const char * const bash_envp[] = { "TMPDIR=shouldnotbeset", NULL };

static inline int sys_pidfd_open(pid_t pid, unsigned int flags)
{
	return syscall(__NR_pidfd_open, pid, flags);
}

static int update_storage(int map_fd, int secureexec)
{
	int task_fd, ret = 0;