Commit d41bc48b authored by Andrii Nakryiko, committed by Daniel Borkmann
Browse files

selftests/bpf: Add uprobe triggering overhead benchmarks



Add benchmark to measure overhead of uprobes and uretprobes. Also have
a baseline (no uprobe attached) benchmark.

On my dev machine, baseline benchmark can trigger 130M user_target()
invocations per second. When uprobe is attached, this falls to just 700K/s.
With uretprobe, we get down to 520K/s:

  $ sudo ./bench trig-uprobe-base -a
  Summary: hits  131.289 ± 2.872M/s

  # UPROBE
  $ sudo ./bench -a trig-uprobe-without-nop
  Summary: hits    0.729 ± 0.007M/s

  $ sudo ./bench -a trig-uprobe-with-nop
  Summary: hits    1.798 ± 0.017M/s

  # URETPROBE
  $ sudo ./bench -a trig-uretprobe-without-nop
  Summary: hits    0.508 ± 0.012M/s

  $ sudo ./bench -a trig-uretprobe-with-nop
  Summary: hits    0.883 ± 0.008M/s

So there is almost 2.5x performance difference between probing nop vs
non-nop instruction for entry uprobe. And 1.7x difference for uretprobe.

This means that the overhead of probing a non-nop instruction is around
1.4 microseconds for uprobe and 2 microseconds for uretprobe.

For nop variants, uprobe and uretprobe overhead is down to 0.556 and
1.13 microseconds, respectively.

For comparison, just doing a very low-overhead syscall (with no BPF
programs attached anywhere) gives:

  $ sudo ./bench trig-base -a
  Summary: hits    4.830 ± 0.036M/s

So uprobes are about 2.67x slower than pure context switch.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211116013041.4072571-1-andrii@kernel.org
parent ebf7f6f0
Loading
Loading
Loading
Loading
+3 −1
Original line number Diff line number Diff line
@@ -533,7 +533,9 @@ $(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \
$(OUTPUT)/bench_bloom_filter_map.o: $(OUTPUT)/bloom_filter_bench.skel.h
$(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
$(OUTPUT)/bench: LDLIBS += -lm
$(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \
$(OUTPUT)/bench: $(OUTPUT)/bench.o \
		 $(OUTPUT)/testing_helpers.o \
		 $(OUTPUT)/trace_helpers.o \
		 $(OUTPUT)/bench_count.o \
		 $(OUTPUT)/bench_rename.o \
		 $(OUTPUT)/bench_trigger.o \
+10 −0
Original line number Diff line number Diff line
@@ -359,6 +359,11 @@ extern const struct bench bench_trig_kprobe;
extern const struct bench bench_trig_fentry;
extern const struct bench bench_trig_fentry_sleep;
extern const struct bench bench_trig_fmodret;
extern const struct bench bench_trig_uprobe_base;
extern const struct bench bench_trig_uprobe_with_nop;
extern const struct bench bench_trig_uretprobe_with_nop;
extern const struct bench bench_trig_uprobe_without_nop;
extern const struct bench bench_trig_uretprobe_without_nop;
extern const struct bench bench_rb_libbpf;
extern const struct bench bench_rb_custom;
extern const struct bench bench_pb_libbpf;
@@ -385,6 +390,11 @@ static const struct bench *benchs[] = {
	&bench_trig_fentry,
	&bench_trig_fentry_sleep,
	&bench_trig_fmodret,
	&bench_trig_uprobe_base,
	&bench_trig_uprobe_with_nop,
	&bench_trig_uretprobe_with_nop,
	&bench_trig_uprobe_without_nop,
	&bench_trig_uretprobe_without_nop,
	&bench_rb_libbpf,
	&bench_rb_custom,
	&bench_pb_libbpf,
+146 −0
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@
/* Copyright (c) 2020 Facebook */
#include "bench.h"
#include "trigger_bench.skel.h"
#include "trace_helpers.h"

/* BPF triggering benchmarks */
static struct trigger_ctx {
@@ -107,6 +108,101 @@ static void *trigger_consumer(void *input)
	return NULL;
}

/* make sure call is not inlined and not avoided by compiler, so __weak and
 * inline asm volatile in the body of the function
 *
 * There is a performance difference between uprobing at nop location vs other
 * instructions. So use two different targets, one of which starts with nop
 * and another doesn't.
 *
 * GCC doesn't generate stack setup preamble for these functions due to them
 * having no input arguments and doing nothing in the body.
 */
/* Uprobe target whose body is a single explicit nop instruction. */
__weak void uprobe_target_with_nop(void)
{
	asm volatile ("nop");
}

/* Uprobe target that does not start with a nop: the asm template is empty,
 * so the asm statement itself contributes no instruction. It is still __weak
 * with a volatile asm body so that calls to it are neither inlined nor
 * dropped by the compiler.
 */
__weak void uprobe_target_without_nop(void)
{
	asm volatile ("");
}

/* Baseline producer thread (no probe attached): calls the nop target in an
 * endless loop and counts each call directly in base_hits. The loop has no
 * exit, so the trailing return is never reached.
 */
static void *uprobe_base_producer(void *input)
{
	for (;;) {
		uprobe_target_with_nop();
		atomic_inc(&base_hits.value);
	}

	return NULL;
}

/* Producer thread for the "with nop" benchmarks: calls
 * uprobe_target_with_nop() in an endless loop. No counting is done here.
 * The loop has no exit, so the trailing return is never reached.
 */
static void *uprobe_producer_with_nop(void *input)
{
	for (;;) {
		uprobe_target_with_nop();
	}

	return NULL;
}

/* Producer thread for the "without nop" benchmarks: calls
 * uprobe_target_without_nop() in an endless loop. No counting is done here.
 * The loop has no exit, so the trailing return is never reached.
 */
static void *uprobe_producer_without_nop(void *input)
{
	for (;;) {
		uprobe_target_without_nop();
	}

	return NULL;
}

/* Common setup for the uprobe/uretprobe benchmarks: loads the trigger_bench
 * skeleton and attaches its bench_trigger_uprobe program to one of the two
 * uprobe target functions of this binary ("/proc/self/exe").
 *
 * @use_retprobe: forwarded to bpf_program__attach_uprobe() as its retprobe
 *                argument (true for the uretprobe benchmarks)
 * @use_nop: probe uprobe_target_with_nop() when true, otherwise
 *           uprobe_target_without_nop()
 *
 * Any failure is reported on stderr and terminates the process with exit(1).
 */
static void usetup(bool use_retprobe, bool use_nop)
{
	size_t uprobe_offset;
	ssize_t base_addr;
	struct bpf_link *link;

	setup_libbpf();

	ctx.skel = trigger_bench__open_and_load();
	if (!ctx.skel) {
		fprintf(stderr, "failed to open skeleton\n");
		exit(1);
	}

	/* base_addr is signed; treat a negative value as a lookup failure
	 * instead of silently computing a bogus probe offset from it.
	 * NOTE(review): assumes get_base_addr() reports errors as negative
	 * values -- confirm against its definition.
	 */
	base_addr = get_base_addr();
	if (base_addr < 0) {
		fprintf(stderr, "failed to find base address: %zd\n", base_addr);
		exit(1);
	}

	if (use_nop)
		uprobe_offset = get_uprobe_offset(&uprobe_target_with_nop, base_addr);
	else
		uprobe_offset = get_uprobe_offset(&uprobe_target_without_nop, base_addr);

	link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe,
					  use_retprobe,
					  -1 /* all PIDs */,
					  "/proc/self/exe",
					  uprobe_offset);
	if (!link) {
		fprintf(stderr, "failed to attach uprobe!\n");
		exit(1);
	}
	/* store the link in the skeleton's links slot for this program */
	ctx.skel->links.bench_trigger_uprobe = link;
}

/* Setup callback: entry uprobe (use_retprobe = false) on the nop target.
 * Declared with an explicit (void) prototype rather than an empty,
 * unprototyped parameter list.
 */
static void uprobe_setup_with_nop(void)
{
	usetup(false, true);
}

/* Setup callback: uretprobe (use_retprobe = true) on the nop target.
 * Declared with an explicit (void) prototype rather than an empty,
 * unprototyped parameter list.
 */
static void uretprobe_setup_with_nop(void)
{
	usetup(true, true);
}

/* Setup callback: entry uprobe (use_retprobe = false) on the non-nop target.
 * Declared with an explicit (void) prototype rather than an empty,
 * unprototyped parameter list.
 */
static void uprobe_setup_without_nop(void)
{
	usetup(false, false);
}

/* Setup callback: uretprobe (use_retprobe = true) on the non-nop target.
 * Declared with an explicit (void) prototype rather than an empty,
 * unprototyped parameter list.
 */
static void uretprobe_setup_without_nop(void)
{
	usetup(true, false);
}

const struct bench bench_trig_base = {
	.name = "trig-base",
	.validate = trigger_validate,
@@ -182,3 +278,53 @@ const struct bench bench_trig_fmodret = {
	.report_progress = hits_drops_report_progress,
	.report_final = hits_drops_report_final,
};

/* "trig-uprobe-base": baseline for the uprobe benchmarks. Nothing is attached
 * (no setup callback); the producer calls the nop target and counts the calls
 * itself, and results are read via trigger_base_measure.
 */
const struct bench bench_trig_uprobe_base = {
	.name = "trig-uprobe-base",
	.setup = NULL, /* no uprobe/uretprobe is attached */
	.producer_thread = uprobe_base_producer,
	.consumer_thread = trigger_consumer,
	.measure = trigger_base_measure,
	.report_progress = hits_drops_report_progress,
	.report_final = hits_drops_report_final,
};

/* "trig-uprobe-with-nop": entry uprobe attached to uprobe_target_with_nop(),
 * which the producer thread calls in a loop.
 */
const struct bench bench_trig_uprobe_with_nop = {
	.name = "trig-uprobe-with-nop",
	.setup = uprobe_setup_with_nop,
	.producer_thread = uprobe_producer_with_nop,
	.consumer_thread = trigger_consumer,
	.measure = trigger_measure,
	.report_progress = hits_drops_report_progress,
	.report_final = hits_drops_report_final,
};

/* "trig-uretprobe-with-nop": uretprobe attached to uprobe_target_with_nop(),
 * which the producer thread calls in a loop.
 */
const struct bench bench_trig_uretprobe_with_nop = {
	.name = "trig-uretprobe-with-nop",
	.setup = uretprobe_setup_with_nop,
	.producer_thread = uprobe_producer_with_nop,
	.consumer_thread = trigger_consumer,
	.measure = trigger_measure,
	.report_progress = hits_drops_report_progress,
	.report_final = hits_drops_report_final,
};

/* "trig-uprobe-without-nop": entry uprobe attached to
 * uprobe_target_without_nop(), which the producer thread calls in a loop.
 */
const struct bench bench_trig_uprobe_without_nop = {
	.name = "trig-uprobe-without-nop",
	.setup = uprobe_setup_without_nop,
	.producer_thread = uprobe_producer_without_nop,
	.consumer_thread = trigger_consumer,
	.measure = trigger_measure,
	.report_progress = hits_drops_report_progress,
	.report_final = hits_drops_report_final,
};

/* "trig-uretprobe-without-nop": uretprobe attached to
 * uprobe_target_without_nop(), which the producer thread calls in a loop.
 */
const struct bench bench_trig_uretprobe_without_nop = {
	.name = "trig-uretprobe-without-nop",
	.setup = uretprobe_setup_without_nop,
	.producer_thread = uprobe_producer_without_nop,
	.consumer_thread = trigger_consumer,
	.measure = trigger_measure,
	.report_progress = hits_drops_report_progress,
	.report_final = hits_drops_report_final,
};
+7 −0
Original line number Diff line number Diff line
@@ -52,3 +52,10 @@ int bench_trigger_fmodret(void *ctx)
	__sync_add_and_fetch(&hits, 1);
	return -22;
}

/* BPF program for the uprobe/uretprobe benchmarks: atomically adds 1 to the
 * hits counter on every invocation and returns 0.
 *
 * NOTE(review): the "uprobe_target" part of the section name does not match
 * the names of the user-space target functions; the benchmark setup appears
 * to attach this program explicitly by offset via
 * bpf_program__attach_uprobe() -- confirm the section name is only a label.
 */
SEC("uprobe/self/uprobe_target")
int bench_trigger_uprobe(void *ctx)
{
	__sync_add_and_fetch(&hits, 1);
	return 0;
}