Commit ec151037, authored by Joanne Koong and committed by Alexei Starovoitov
Browse files

selftest/bpf/benchs: Add bpf_loop benchmark



Add benchmark to measure the throughput and latency of the bpf_loop
call.

Testing this on my dev machine on 1 thread, the data is as follows:

        nr_loops: 10
bpf_loop - throughput: 198.519 ± 0.155 M ops/s, latency: 5.037 ns/op

        nr_loops: 100
bpf_loop - throughput: 247.448 ± 0.305 M ops/s, latency: 4.041 ns/op

        nr_loops: 500
bpf_loop - throughput: 260.839 ± 0.380 M ops/s, latency: 3.834 ns/op

        nr_loops: 1000
bpf_loop - throughput: 262.806 ± 0.629 M ops/s, latency: 3.805 ns/op

        nr_loops: 5000
bpf_loop - throughput: 264.211 ± 1.508 M ops/s, latency: 3.785 ns/op

        nr_loops: 10000
bpf_loop - throughput: 265.366 ± 3.054 M ops/s, latency: 3.768 ns/op

        nr_loops: 50000
bpf_loop - throughput: 235.986 ± 20.205 M ops/s, latency: 4.238 ns/op

        nr_loops: 100000
bpf_loop - throughput: 264.482 ± 0.279 M ops/s, latency: 3.781 ns/op

        nr_loops: 500000
bpf_loop - throughput: 309.773 ± 87.713 M ops/s, latency: 3.228 ns/op

        nr_loops: 1000000
bpf_loop - throughput: 262.818 ± 4.143 M ops/s, latency: 3.805 ns/op

From this data, we can see that the latency per loop decreases as the
number of loops increases. On this particular machine, each loop had an
overhead of about 4 ns, and we were able to run ~250 million loops
per second.

Signed-off-by: Joanne Koong <joannekoong@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211130030622.4131246-5-joannekoong@fb.com
parent f6e659b7
Loading
Loading
Loading
Loading
+3 −1
Original line number Original line Diff line number Diff line
@@ -531,6 +531,7 @@ $(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h
$(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \
$(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \
			    $(OUTPUT)/perfbuf_bench.skel.h
			    $(OUTPUT)/perfbuf_bench.skel.h
$(OUTPUT)/bench_bloom_filter_map.o: $(OUTPUT)/bloom_filter_bench.skel.h
$(OUTPUT)/bench_bloom_filter_map.o: $(OUTPUT)/bloom_filter_bench.skel.h
$(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h
$(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
$(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
$(OUTPUT)/bench: LDLIBS += -lm
$(OUTPUT)/bench: LDLIBS += -lm
$(OUTPUT)/bench: $(OUTPUT)/bench.o \
$(OUTPUT)/bench: $(OUTPUT)/bench.o \
@@ -540,7 +541,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
		 $(OUTPUT)/bench_rename.o \
		 $(OUTPUT)/bench_rename.o \
		 $(OUTPUT)/bench_trigger.o \
		 $(OUTPUT)/bench_trigger.o \
		 $(OUTPUT)/bench_ringbufs.o \
		 $(OUTPUT)/bench_ringbufs.o \
		 $(OUTPUT)/bench_bloom_filter_map.o
		 $(OUTPUT)/bench_bloom_filter_map.o \
		 $(OUTPUT)/bench_bpf_loop.o
	$(call msg,BINARY,,$@)
	$(call msg,BINARY,,$@)
	$(Q)$(CC) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
	$(Q)$(CC) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@


+37 −0
Original line number Original line Diff line number Diff line
@@ -134,6 +134,39 @@ void hits_drops_report_final(struct bench_res res[], int res_cnt)
	       total_ops_mean, total_ops_stddev);
	       total_ops_mean, total_ops_stddev);
}
}


/*
 * Print a one-line progress report for a single benchmark iteration.
 *
 * @iter:     iteration number shown in the output
 * @res:      result of this iteration; only res->hits is read
 * @delta_ns: length of the measured interval in nanoseconds
 *
 * The hit count is reported in millions per second, both in total and
 * divided by env.producer_cnt. The first field shows how far the interval
 * deviated from one second, in microseconds.
 */
void ops_report_progress(int iter, struct bench_res *res, long delta_ns)
{
	const double interval_sec = delta_ns / 1000000000.0;
	const double mops_total = res->hits / 1000000.0 / interval_sec;
	const double mops_per_prod = mops_total / env.producer_cnt;

	printf("Iter %3d (%7.3lfus): ", iter, (delta_ns - 1000000000) / 1000.0);

	printf("hits %8.3lfM/s (%7.3lfM/prod)\n", mops_total, mops_per_prod);
}

/*
 * Print the final summary line over all collected iterations.
 *
 * @res:     array of per-iteration results; only the hits field is read
 * @res_cnt: number of entries in @res
 *
 * Reports the mean and the sample standard deviation of the hit count
 * (in millions), the mean divided by env.producer_cnt, and a latency figure
 * computed as 1000.0 / mean * env.producer_cnt.
 */
void ops_report_final(struct bench_res res[], int res_cnt)
{
	double mean_mops = 0.0, stddev_mops = 0.0;
	int idx;

	for (idx = 0; idx < res_cnt; idx++) {
		double mops = res[idx].hits / 1000000.0;

		mean_mops += mops / (0.0 + res_cnt);
	}

	/* the (res_cnt - 1) divisor is only meaningful with two or more samples */
	if (res_cnt > 1) {
		for (idx = 0; idx < res_cnt; idx++) {
			double dev = mean_mops - res[idx].hits / 1000000.0;

			stddev_mops += dev * dev / (res_cnt - 1.0);
		}

		stddev_mops = sqrt(stddev_mops);
	}

	printf("Summary: throughput %8.3lf \u00B1 %5.3lf M ops/s (%7.3lfM ops/prod), ",
	       mean_mops, stddev_mops, mean_mops / env.producer_cnt);
	printf("latency %8.3lf ns/op\n", 1000.0 / mean_mops * env.producer_cnt);
}

const char *argp_program_version = "benchmark";
const char *argp_program_version = "benchmark";
const char *argp_program_bug_address = "<bpf@vger.kernel.org>";
const char *argp_program_bug_address = "<bpf@vger.kernel.org>";
const char argp_program_doc[] =
const char argp_program_doc[] =
@@ -171,10 +204,12 @@ static const struct argp_option opts[] = {


extern struct argp bench_ringbufs_argp;
extern struct argp bench_ringbufs_argp;
extern struct argp bench_bloom_map_argp;
extern struct argp bench_bloom_map_argp;
extern struct argp bench_bpf_loop_argp;


static const struct argp_child bench_parsers[] = {
static const struct argp_child bench_parsers[] = {
	{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
	{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
	{ &bench_bloom_map_argp, 0, "Bloom filter map benchmark", 0 },
	{ &bench_bloom_map_argp, 0, "Bloom filter map benchmark", 0 },
	{ &bench_bpf_loop_argp, 0, "bpf_loop helper benchmark", 0 },
	{},
	{},
};
};


@@ -373,6 +408,7 @@ extern const struct bench bench_bloom_update;
extern const struct bench bench_bloom_false_positive;
extern const struct bench bench_bloom_false_positive;
extern const struct bench bench_hashmap_without_bloom;
extern const struct bench bench_hashmap_without_bloom;
extern const struct bench bench_hashmap_with_bloom;
extern const struct bench bench_hashmap_with_bloom;
extern const struct bench bench_bpf_loop;


static const struct bench *benchs[] = {
static const struct bench *benchs[] = {
	&bench_count_global,
	&bench_count_global,
@@ -404,6 +440,7 @@ static const struct bench *benchs[] = {
	&bench_bloom_false_positive,
	&bench_bloom_false_positive,
	&bench_hashmap_without_bloom,
	&bench_hashmap_without_bloom,
	&bench_hashmap_with_bloom,
	&bench_hashmap_with_bloom,
	&bench_bpf_loop,
};
};


static void setup_benchmark()
static void setup_benchmark()
+2 −0
Original line number Original line Diff line number Diff line
@@ -59,6 +59,8 @@ void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns);
void hits_drops_report_final(struct bench_res res[], int res_cnt);
void hits_drops_report_final(struct bench_res res[], int res_cnt);
void false_hits_report_progress(int iter, struct bench_res *res, long delta_ns);
void false_hits_report_progress(int iter, struct bench_res *res, long delta_ns);
void false_hits_report_final(struct bench_res res[], int res_cnt);
void false_hits_report_final(struct bench_res res[], int res_cnt);
void ops_report_progress(int iter, struct bench_res *res, long delta_ns);
void ops_report_final(struct bench_res res[], int res_cnt);


static inline __u64 get_time_ns() {
static inline __u64 get_time_ns() {
	struct timespec t;
	struct timespec t;
+105 −0
Original line number Original line Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021 Facebook */

#include <argp.h>
#include <errno.h>
#include <limits.h>
#include "bench.h"
#include "bpf_loop_bench.skel.h"

/* State shared by the bpf_loop benchmark callbacks in this file */
static struct ctx {
	/* skeleton handle; assigned in setup() and read by measure() */
	struct bpf_loop_bench *skel;
} ctx;

/* Benchmark-specific command line settings; written by parse_arg() */
static struct {
	/* loop count that setup() copies into the BPF program's nr_loops */
	__u32 nr_loops;
} args = {
	/* value used when --nr_loops is not given */
	.nr_loops = 10,
};

/*
 * argp option keys owned by this benchmark.
 * NOTE(review): 4000 is presumably chosen to stay clear of keys used by the
 * other benchmark parsers - confirm against bench.c and sibling benchmarks.
 */
enum {
	ARG_NR_LOOPS = 4000,
};

/* Option table for this benchmark's argp child parser */
static const struct argp_option opts[] = {
	/* --nr_loops <nr_loops>: handled in parse_arg() under ARG_NR_LOOPS */
	{ "nr_loops", ARG_NR_LOOPS, "nr_loops", 0,
		"Set number of loops for the bpf_loop helper"},
	/* empty entry terminates the table */
	{},
};

/*
 * argp callback for the options private to the bpf_loop benchmark.
 *
 * @key:   option key supplied by argp (ARG_NR_LOOPS for --nr_loops)
 * @arg:   textual value of the option
 * @state: argp parser state, used only to report a usage error
 *
 * Returns 0 when the option was consumed, ARGP_ERR_UNKNOWN for keys this
 * parser does not own, and EINVAL for a rejected --nr_loops value.
 *
 * args.nr_loops is a __u32, so the value must be a plain decimal number in
 * the range [1, UINT_MAX]. Anything else (empty string, trailing garbage,
 * zero, negative or too large) is rejected instead of being silently
 * truncated. argp_usage() normally prints the usage text and exits; the
 * EINVAL return covers parsers configured not to exit.
 */
static error_t parse_arg(int key, char *arg, struct argp_state *state)
{
	char *end;
	long val;

	switch (key) {
	case ARG_NR_LOOPS:
		errno = 0;
		val = strtol(arg, &end, 10);
		if (errno || end == arg || *end != '\0' ||
		    val < 1 || (unsigned long)val > UINT_MAX) {
			fprintf(stderr, "Invalid nr_loops: %s\n", arg);
			argp_usage(state);
			return EINVAL;
		}
		args.nr_loops = val;
		break;
	default:
		return ARGP_ERR_UNKNOWN;
	}

	return 0;
}

/*
 * argp parser for this benchmark's options (opts, handled by parse_arg()).
 * Non-static on purpose: exported into the benchmark runner.
 */
const struct argp bench_bpf_loop_argp = {
	.options = opts,
	.parser = parse_arg,
};

/*
 * Check the runner configuration before the benchmark starts.
 * Any consumer count other than exactly one is refused: a message is
 * printed to stderr and the process exits with status 1.
 */
static void validate(void)
{
	if (env.consumer_cnt == 1)
		return;

	fprintf(stderr, "benchmark doesn't support multi-consumer!\n");
	exit(1);
}

/*
 * Producer thread body: issue getpgid syscalls in an endless loop.
 * Each syscall is what triggers the bpf program; @input is unused.
 * The loop never terminates, so the return statement is not reached.
 */
static void *producer(void *input)
{
	for (;;)
		syscall(__NR_getpgid);

	return NULL;
}

/*
 * Consumer thread body. This benchmark has nothing to consume, so the
 * thread returns immediately; @input is ignored.
 */
static void *consumer(void *input)
{
	(void)input;

	return NULL;
}

/*
 * Collect the result for one measurement interval.
 *
 * Stores the current value of the BPF program's hits counter in res->hits
 * and resets the counter to 0, so each call reports only what accumulated
 * since the previous one. atomic_swap() is defined outside this file;
 * presumably it performs the read and the reset as one atomic exchange.
 */
static void measure(struct bench_res *res)
{
	res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
}

/*
 * Prepare the benchmark: initialise libbpf, open and load the
 * bpf_loop_bench skeleton, configure the loop count and attach the
 * "benchmark" program.
 *
 * Any failure prints a message to stderr and exits with status 1.
 * The returned link is never released here; it is presumably meant to live
 * for the rest of the process.
 */
static void setup(void)
{
	struct bpf_link *link;

	setup_libbpf();

	ctx.skel = bpf_loop_bench__open_and_load();
	if (!ctx.skel) {
		fprintf(stderr, "failed to open skeleton\n");
		exit(1);
	}

	/*
	 * Configure the loop count before attaching, so the program can
	 * never fire with an unconfigured nr_loops between attach and
	 * this assignment.
	 */
	ctx.skel->bss->nr_loops = args.nr_loops;

	link = bpf_program__attach(ctx.skel->progs.benchmark);
	if (!link) {
		fprintf(stderr, "failed to attach program!\n");
		exit(1);
	}
}

/*
 * Benchmark descriptor for the bpf_loop benchmark, registered under the
 * name "bpf-loop". Wires the callbacks defined in this file to the generic
 * ops_report_progress()/ops_report_final() reporting functions.
 */
const struct bench bench_bpf_loop = {
	.name = "bpf-loop",
	.validate = validate,
	.setup = setup,
	.producer_thread = producer,
	.consumer_thread = consumer,
	.measure = measure,
	.report_progress = ops_report_progress,
	.report_final = ops_report_final,
};
+15 −0
Original line number Original line Diff line number Diff line
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

# Helpers used below (subtitle, summarize_ops, $RUN_BENCH) come from here.
source ./benchs/run_common.sh

set -eufo pipefail

# Run the bpf-loop benchmark for every combination of producer thread count
# and loop count, printing one summary per combination.
for nr_threads in 1 4 8 12 16; do
	for nr_loops in 10 100 500 1000 5000 10000 50000 100000 500000 1000000; do
		subtitle "nr_loops: $nr_loops, nr_threads: $nr_threads"
		summarize_ops "bpf_loop: " \
		    "$($RUN_BENCH -p $nr_threads --nr_loops $nr_loops bpf-loop)"
		printf "\n"
	done
done
Loading