Commit d5e9aeda authored by Alexei Starovoitov's avatar Alexei Starovoitov
Browse files

Merge branch 'Optimize performance of update hash-map when free is zero'

Feng zhou says:

====================
From: Feng Zhou <zhoufeng.zf@bytedance.com>

We encountered bad case on big system with 96 CPUs that
alloc_htab_elem() would last for 1ms. The reason is that after the
prealloc hashtab has no free elems, when trying to update, it will still
grab spin_locks of all cpus. If there are multiple update users, the
competition is very serious.

0001: Use head->first to check whether the free list is empty or not before taking
the lock.
0002: Add benchmark to reproduce this worst case.

Changelog:
v5->v6: Addressed comments from Alexei Starovoitov.
- Adjust the commit log.
some details in here:
https://lore.kernel.org/all/20220608021050.47279-1-zhoufeng.zf@bytedance.com/

v4->v5: Addressed comments from Alexei Starovoitov.
- Use head->first.
- Use cpu+max_entries.
some details in here:
https://lore.kernel.org/bpf/20220601084149.13097-1-zhoufeng.zf@bytedance.com/

v3->v4: Addressed comments from Daniel Borkmann.
- Use READ_ONCE/WRITE_ONCE.
some details in here:
https://lore.kernel.org/all/20220530091340.53443-1-zhoufeng.zf@bytedance.com/

v2->v3: Addressed comments from Alexei Starovoitov, Andrii Nakryiko.
- Adjust the way the benchmark is tested.
- Adjust the code format.
some details in here:
https://lore.kernel.org/all/20220524075306.32306-1-zhoufeng.zf@bytedance.com/T/

v1->v2: Addressed comments from Alexei Starovoitov.
- add a benchmark to reproduce the issue.
- Adjust the code format that avoid adding indent.
some details in here:
https://lore.kernel.org/all/877ac441-045b-1844-6938-fcaee5eee7f2@bytedance.com/T/


====================

Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents fe928335 89eda984
Loading
Loading
Loading
Loading
+14 −6
Original line number Diff line number Diff line
@@ -31,7 +31,7 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
					   struct pcpu_freelist_node *node)
{
	node->next = head->first;
	head->first = node;
	WRITE_ONCE(head->first, node);
}

static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
@@ -130,14 +130,17 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
	orig_cpu = cpu = raw_smp_processor_id();
	while (1) {
		head = per_cpu_ptr(s->freelist, cpu);
		if (!READ_ONCE(head->first))
			goto next_cpu;
		raw_spin_lock(&head->lock);
		node = head->first;
		if (node) {
			head->first = node->next;
			WRITE_ONCE(head->first, node->next);
			raw_spin_unlock(&head->lock);
			return node;
		}
		raw_spin_unlock(&head->lock);
next_cpu:
		cpu = cpumask_next(cpu, cpu_possible_mask);
		if (cpu >= nr_cpu_ids)
			cpu = 0;
@@ -146,10 +149,12 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
	}

	/* per cpu lists are all empty, try extralist */
	if (!READ_ONCE(s->extralist.first))
		return NULL;
	raw_spin_lock(&s->extralist.lock);
	node = s->extralist.first;
	if (node)
		s->extralist.first = node->next;
		WRITE_ONCE(s->extralist.first, node->next);
	raw_spin_unlock(&s->extralist.lock);
	return node;
}
@@ -164,15 +169,18 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
	orig_cpu = cpu = raw_smp_processor_id();
	while (1) {
		head = per_cpu_ptr(s->freelist, cpu);
		if (!READ_ONCE(head->first))
			goto next_cpu;
		if (raw_spin_trylock(&head->lock)) {
			node = head->first;
			if (node) {
				head->first = node->next;
				WRITE_ONCE(head->first, node->next);
				raw_spin_unlock(&head->lock);
				return node;
			}
			raw_spin_unlock(&head->lock);
		}
next_cpu:
		cpu = cpumask_next(cpu, cpu_possible_mask);
		if (cpu >= nr_cpu_ids)
			cpu = 0;
@@ -181,11 +189,11 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
	}

	/* cannot pop from per cpu lists, try extralist */
	if (!raw_spin_trylock(&s->extralist.lock))
	if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock))
		return NULL;
	node = s->extralist.first;
	if (node)
		s->extralist.first = node->next;
		WRITE_ONCE(s->extralist.first, node->next);
	raw_spin_unlock(&s->extralist.lock);
	return node;
}
+3 −1
Original line number Diff line number Diff line
@@ -560,6 +560,7 @@ $(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \
$(OUTPUT)/bench_bloom_filter_map.o: $(OUTPUT)/bloom_filter_bench.skel.h
$(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h
$(OUTPUT)/bench_strncmp.o: $(OUTPUT)/strncmp_bench.skel.h
$(OUTPUT)/bench_bpf_hashmap_full_update.o: $(OUTPUT)/bpf_hashmap_full_update_bench.skel.h
$(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
$(OUTPUT)/bench: LDLIBS += -lm
$(OUTPUT)/bench: $(OUTPUT)/bench.o \
@@ -571,7 +572,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
		 $(OUTPUT)/bench_ringbufs.o \
		 $(OUTPUT)/bench_bloom_filter_map.o \
		 $(OUTPUT)/bench_bpf_loop.o \
		 $(OUTPUT)/bench_strncmp.o
		 $(OUTPUT)/bench_strncmp.o \
		 $(OUTPUT)/bench_bpf_hashmap_full_update.o
	$(call msg,BINARY,,$@)
	$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@

+2 −0
Original line number Diff line number Diff line
@@ -396,6 +396,7 @@ extern const struct bench bench_hashmap_with_bloom;
extern const struct bench bench_bpf_loop;
extern const struct bench bench_strncmp_no_helper;
extern const struct bench bench_strncmp_helper;
extern const struct bench bench_bpf_hashmap_full_update;

static const struct bench *benchs[] = {
	&bench_count_global,
@@ -430,6 +431,7 @@ static const struct bench *benchs[] = {
	&bench_bpf_loop,
	&bench_strncmp_no_helper,
	&bench_strncmp_helper,
	&bench_bpf_hashmap_full_update,
};

static void setup_benchmark()
+96 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2022 Bytedance */

#include <argp.h>
#include "bench.h"
#include "bpf_hashmap_full_update_bench.skel.h"
#include "bpf_util.h"

/* BPF triggering benchmarks */
static struct ctx {
	struct bpf_hashmap_full_update_bench *skel;
} ctx;

#define MAX_LOOP_NUM 10000

static void validate(void)
{
	if (env.consumer_cnt != 1) {
		fprintf(stderr, "benchmark doesn't support multi-consumer!\n");
		exit(1);
	}
}

static void *producer(void *input)
{
	while (true) {
		/* trigger the bpf program */
		syscall(__NR_getpgid);
	}

	return NULL;
}

static void *consumer(void *input)
{
	return NULL;
}

static void measure(struct bench_res *res)
{
}

static void setup(void)
{
	struct bpf_link *link;
	int map_fd, i, max_entries;

	setup_libbpf();

	ctx.skel = bpf_hashmap_full_update_bench__open_and_load();
	if (!ctx.skel) {
		fprintf(stderr, "failed to open skeleton\n");
		exit(1);
	}

	ctx.skel->bss->nr_loops = MAX_LOOP_NUM;

	link = bpf_program__attach(ctx.skel->progs.benchmark);
	if (!link) {
		fprintf(stderr, "failed to attach program!\n");
		exit(1);
	}

	/* fill hash_map */
	map_fd = bpf_map__fd(ctx.skel->maps.hash_map_bench);
	max_entries = bpf_map__max_entries(ctx.skel->maps.hash_map_bench);
	for (i = 0; i < max_entries; i++)
		bpf_map_update_elem(map_fd, &i, &i, BPF_ANY);
}

void hashmap_report_final(struct bench_res res[], int res_cnt)
{
	unsigned int nr_cpus = bpf_num_possible_cpus();
	int i;

	for (i = 0; i < nr_cpus; i++) {
		u64 time = ctx.skel->bss->percpu_time[i];

		if (!time)
			continue;

		printf("%d:hash_map_full_perf %lld events per sec\n",
		       i, ctx.skel->bss->nr_loops * 1000000000ll / time);
	}
}

const struct bench bench_bpf_hashmap_full_update = {
	.name = "bpf-hashmap-ful-update",
	.validate = validate,
	.setup = setup,
	.producer_thread = producer,
	.consumer_thread = consumer,
	.measure = measure,
	.report_progress = NULL,
	.report_final = hashmap_report_final,
};
+11 −0
Original line number Diff line number Diff line
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

source ./benchs/run_common.sh

set -eufo pipefail

nr_threads=`expr $(cat /proc/cpuinfo | grep "processor"| wc -l) - 1`
summary=$($RUN_BENCH -p $nr_threads bpf-hashmap-ful-update)
printf "$summary"
printf "\n"
Loading