Commit fa853c4b authored by Song Liu's avatar Song Liu Committed by Arnaldo Carvalho de Melo
Browse files

perf stat: Enable counting events for BPF programs



Introduce 'perf stat -b' option, which counts events for BPF programs, like:

  [root@localhost ~]# ~/perf stat -e ref-cycles,cycles -b 254 -I 1000
     1.487903822            115,200      ref-cycles
     1.487903822             86,012      cycles
     2.489147029             80,560      ref-cycles
     2.489147029             73,784      cycles
     3.490341825             60,720      ref-cycles
     3.490341825             37,797      cycles
     4.491540887             37,120      ref-cycles
     4.491540887             31,963      cycles

The example above counts 'cycles' and 'ref-cycles' of BPF program of id
254.  This is similar to bpftool-prog-profile command, but more
flexible.

'perf stat -b' creates per-cpu perf_event and loads fentry/fexit BPF
programs (monitor-progs) to the target BPF program (target-prog). The
monitor-progs read perf_event before and after the target-prog, and
aggregate the difference in a BPF map. Then the user space reads data
from these maps.

A new 'struct bpf_counter' is introduced to provide a common interface
that uses BPF programs/maps to count perf events.

Committer notes:

Removed all but bpf_counter.h includes from evsel.h, not needed at all.

Also BPF map lookups for PERCPU_ARRAYs need the value buffer passed to the
kernel to have libbpf_num_possible_cpus() entries, not
evsel__nr_cpus(evsel), as the former uses
/sys/devices/system/cpu/possible while the latter uses
/sys/devices/system/cpu/online, which may be less than the 'possible'
number making the bpf map lookup overwrite memory and cause hard to
debug memory corruption.

We need to continue using evsel__nr_cpus(evsel) when accessing the
perf_counts array though, so as not to overwrite another area of memory :-)

Signed-off-by: default avatarSong Liu <songliubraving@fb.com>
Tested-by: default avatarArnaldo Carvalho de Melo <acme@redhat.com>
Link: https://lore.kernel.org/lkml/20210120163031.GU12699@kernel.org/


Acked-by: default avatarNamhyung Kim <namhyung@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: kernel-team@fb.com
Link: http://lore.kernel.org/lkml/20201229214214.3413833-4-songliubraving@fb.com


Signed-off-by: default avatarArnaldo Carvalho de Melo <acme@redhat.com>
parent fbcdaa19
Loading
Loading
Loading
Loading
+18 −0
Original line number Diff line number Diff line
@@ -75,6 +75,24 @@ report::
--tid=<tid>::
        stat events on existing thread id (comma separated list)

-b::
--bpf-prog::
        stat events on existing bpf program id (comma separated list),
        requiring root rights. bpftool-prog could be used to find the program
        ids of all bpf programs in the system. For example:

  # bpftool prog | head -n 1
  17247: tracepoint  name sys_enter  tag 192d548b9d754067  gpl

  # perf stat -e cycles,instructions --bpf-prog 17247 --timeout 1000

   Performance counter stats for 'BPF program(s) 17247':

             85,967      cycles
             28,982      instructions              #    0.34  insn per cycle

        1.102235068 seconds time elapsed

ifdef::HAVE_LIBPFM[]
--pfm-events events::
Select a PMU event using libpfm4 syntax (see http://perfmon2.sf.net)
+1 −1
Original line number Diff line number Diff line
@@ -1015,7 +1015,7 @@ python-clean:

SKEL_OUT := $(abspath $(OUTPUT)util/bpf_skel)
SKEL_TMP_OUT := $(abspath $(SKEL_OUT)/.tmp)
SKELETONS :=
SKELETONS := $(SKEL_OUT)/bpf_prog_profiler.skel.h

ifdef BUILD_BPF_SKEL
BPFTOOL := $(SKEL_TMP_OUT)/bootstrap/bpftool
+68 −14
Original line number Diff line number Diff line
@@ -67,6 +67,7 @@
#include "util/top.h"
#include "util/affinity.h"
#include "util/pfm.h"
#include "util/bpf_counter.h"
#include "asm/bug.h"

#include <linux/time64.h>
@@ -409,12 +410,32 @@ static int read_affinity_counters(struct timespec *rs)
	return 0;
}

static int read_bpf_map_counters(void)
{
	struct evsel *counter;
	int err;

	evlist__for_each_entry(evsel_list, counter) {
		err = bpf_counter__read(counter);
		if (err)
			return err;
	}
	return 0;
}

static void read_counters(struct timespec *rs)
{
	struct evsel *counter;
	int err;

	if (!stat_config.stop_read_counter && (read_affinity_counters(rs) < 0))
	if (!stat_config.stop_read_counter) {
		if (target__has_bpf(&target))
			err = read_bpf_map_counters();
		else
			err = read_affinity_counters(rs);
		if (err < 0)
			return;
	}

	evlist__for_each_entry(evsel_list, counter) {
		if (counter->err)
@@ -496,11 +517,22 @@ static bool handle_interval(unsigned int interval, int *times)
	return false;
}

static void enable_counters(void)
static int enable_counters(void)
{
	struct evsel *evsel;
	int err;

	if (target__has_bpf(&target)) {
		evlist__for_each_entry(evsel_list, evsel) {
			err = bpf_counter__enable(evsel);
			if (err)
				return err;
		}
	}

	if (stat_config.initial_delay < 0) {
		pr_info(EVLIST_DISABLED_MSG);
		return;
		return 0;
	}

	if (stat_config.initial_delay > 0) {
@@ -518,6 +550,7 @@ static void enable_counters(void)
		if (stat_config.initial_delay > 0)
			pr_info(EVLIST_ENABLED_MSG);
	}
	return 0;
}

static void disable_counters(void)
@@ -720,7 +753,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
	const bool forks = (argc > 0);
	bool is_pipe = STAT_RECORD ? perf_stat.data.is_pipe : false;
	struct affinity affinity;
	int i, cpu;
	int i, cpu, err;
	bool second_pass = false;

	if (forks) {
@@ -737,6 +770,13 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
	if (affinity__setup(&affinity) < 0)
		return -1;

	if (target__has_bpf(&target)) {
		evlist__for_each_entry(evsel_list, counter) {
			if (bpf_counter__load(counter, &target))
				return -1;
		}
	}

	evlist__for_each_cpu (evsel_list, i, cpu) {
		affinity__set(&affinity, cpu);

@@ -850,7 +890,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
	}

	if (STAT_RECORD) {
		int err, fd = perf_data__fd(&perf_stat.data);
		int fd = perf_data__fd(&perf_stat.data);

		if (is_pipe) {
			err = perf_header__write_pipe(perf_data__fd(&perf_stat.data));
@@ -876,7 +916,9 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)

	if (forks) {
		evlist__start_workload(evsel_list);
		enable_counters();
		err = enable_counters();
		if (err)
			return -1;

		if (interval || timeout || evlist__ctlfd_initialized(evsel_list))
			status = dispatch_events(forks, timeout, interval, &times);
@@ -895,7 +937,9 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
		if (WIFSIGNALED(status))
			psignal(WTERMSIG(status), argv[0]);
	} else {
		enable_counters();
		err = enable_counters();
		if (err)
			return -1;
		status = dispatch_events(forks, timeout, interval, &times);
	}

@@ -1085,6 +1129,10 @@ static struct option stat_options[] = {
		   "stat events on existing process id"),
	OPT_STRING('t', "tid", &target.tid, "tid",
		   "stat events on existing thread id"),
#ifdef HAVE_BPF_SKEL
	OPT_STRING('b', "bpf-prog", &target.bpf_str, "bpf-prog-id",
		   "stat events on existing bpf program id"),
#endif
	OPT_BOOLEAN('a', "all-cpus", &target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('g', "group", &group,
@@ -2064,11 +2112,12 @@ int cmd_stat(int argc, const char **argv)
		"perf stat [<options>] [<command>]",
		NULL
	};
	int status = -EINVAL, run_idx;
	int status = -EINVAL, run_idx, err;
	const char *mode;
	FILE *output = stderr;
	unsigned int interval, timeout;
	const char * const stat_subcommands[] = { "record", "report" };
	char errbuf[BUFSIZ];

	setlocale(LC_ALL, "");

@@ -2179,6 +2228,12 @@ int cmd_stat(int argc, const char **argv)
	} else if (big_num_opt == 0) /* User passed --no-big-num */
		stat_config.big_num = false;

	err = target__validate(&target);
	if (err) {
		target__strerror(&target, err, errbuf, BUFSIZ);
		pr_warning("%s\n", errbuf);
	}

	setup_system_wide(argc);

	/*
@@ -2252,8 +2307,6 @@ int cmd_stat(int argc, const char **argv)
		}
	}

	target__validate(&target);

	if ((stat_config.aggr_mode == AGGR_THREAD) && (target.system_wide))
		target.per_thread = true;

@@ -2384,7 +2437,8 @@ int cmd_stat(int argc, const char **argv)
		 * tools remain  -acme
		 */
		int fd = perf_data__fd(&perf_stat.data);
		int err = perf_event__synthesize_kernel_mmap((void *)&perf_stat,

		err = perf_event__synthesize_kernel_mmap((void *)&perf_stat,
							 process_synthesized_event,
							 &perf_stat.session->machines.host);
		if (err) {
+1 −0
Original line number Diff line number Diff line
@@ -135,6 +135,7 @@ perf-y += clockid.o

perf-$(CONFIG_LIBBPF) += bpf-loader.o
perf-$(CONFIG_LIBBPF) += bpf_map.o
perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
perf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o
perf-$(CONFIG_LIBELF) += symbol-elf.o
perf-$(CONFIG_LIBELF) += probe-file.o
+314 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0

/* Copyright (c) 2019 Facebook */

#include <assert.h>
#include <limits.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <linux/err.h>
#include <linux/zalloc.h>
#include <bpf/bpf.h>
#include <bpf/btf.h>
#include <bpf/libbpf.h>

#include "bpf_counter.h"
#include "counts.h"
#include "debug.h"
#include "evsel.h"
#include "target.h"

#include "bpf_skel/bpf_prog_profiler.skel.h"

/* Turn a kernel-provided __u64 "pointer" field back into a usable pointer. */
static inline void *u64_to_ptr(__u64 ptr)
{
	unsigned long addr = (unsigned long)ptr;

	return (void *)addr;
}

/*
 * Best-effort bump of RLIMIT_MEMLOCK to unlimited: creating BPF maps and
 * loading programs can fail with a low memlock limit.  The return value is
 * deliberately ignored; failures surface later at skeleton load time.
 */
static void set_max_rlimit(void)
{
	struct rlimit unlimited = {
		.rlim_cur = RLIM_INFINITY,
		.rlim_max = RLIM_INFINITY,
	};

	setrlimit(RLIMIT_MEMLOCK, &unlimited);
}

/* Allocate a zeroed bpf_counter with its list node ready for linking. */
static struct bpf_counter *bpf_counter_alloc(void)
{
	struct bpf_counter *counter = zalloc(sizeof(*counter));

	if (counter != NULL)
		INIT_LIST_HEAD(&counter->list);

	return counter;
}

/*
 * Tear down every per-target-prog counter owned by this evsel: unlink it,
 * destroy its BPF skeleton (detaching the fentry/fexit progs) and free it.
 * Always returns 0.
 */
static int bpf_program_profiler__destroy(struct evsel *evsel)
{
	struct bpf_counter *pos, *next;

	list_for_each_entry_safe(pos, next, &evsel->bpf_counter_list, list) {
		list_del_init(&pos->list);
		bpf_prog_profiler_bpf__destroy(pos->skel);
		free(pos);
	}
	assert(list_empty(&evsel->bpf_counter_list));

	return 0;
}

/*
 * Look up the name of the BPF program behind fd 'tgt_fd' via its BTF func
 * info, for use as the fentry/fexit attach target.
 *
 * Returns a strdup()'d name the caller must free, or NULL on failure
 * (no BTF, no func info, or allocation failure).
 */
static char *bpf_target_prog_name(int tgt_fd)
{
	struct bpf_prog_info_linear *info_linear;
	struct bpf_func_info *func_info;
	const struct btf_type *t;
	struct btf *btf = NULL;	/* NULL so the cleanup path is always safe */
	char *name = NULL;

	info_linear = bpf_program__get_prog_info_linear(
		tgt_fd, 1UL << BPF_PROG_INFO_FUNC_INFO);
	if (IS_ERR_OR_NULL(info_linear)) {
		pr_debug("failed to get info_linear for prog FD %d\n", tgt_fd);
		return NULL;
	}

	if (info_linear->info.btf_id == 0 ||
	    btf__get_from_id(info_linear->info.btf_id, &btf)) {
		pr_debug("prog FD %d doesn't have valid btf\n", tgt_fd);
		goto out;
	}

	func_info = u64_to_ptr(info_linear->info.func_info);
	/* func_info[0] describes the program's main function */
	t = btf__type_by_id(btf, func_info[0].type_id);
	if (!t) {
		pr_debug("btf %d doesn't have type %d\n",
			 info_linear->info.btf_id, func_info[0].type_id);
		goto out;
	}
	name = strdup(btf__name_by_offset(btf, t->name_off));
out:
	btf__free(btf);		/* was leaked; btf__free(NULL) is a no-op */
	free(info_linear);
	return name;
}

/*
 * Attach one monitor skeleton to the target BPF program with id 'prog_id':
 * open the profiler skeleton, size its maps for this evsel's cpus, set the
 * fentry/fexit attach target to the target prog's main function, load it,
 * and link the resulting bpf_counter into evsel->bpf_counter_list.
 *
 * Returns 0 on success, -1 on failure (with everything cleaned up).
 */
static int bpf_program_profiler_load_one(struct evsel *evsel, u32 prog_id)
{
	struct bpf_prog_profiler_bpf *skel;
	struct bpf_counter *counter;
	struct bpf_program *prog;
	char *prog_name = NULL;
	int prog_fd;
	int err;

	prog_fd = bpf_prog_get_fd_by_id(prog_id);
	if (prog_fd < 0) {
		pr_err("Failed to open fd for bpf prog %u\n", prog_id);
		return -1;
	}
	counter = bpf_counter_alloc();
	if (!counter) {
		close(prog_fd);
		return -1;
	}

	skel = bpf_prog_profiler_bpf__open();
	if (!skel) {
		pr_err("Failed to open bpf skeleton\n");
		/* nothing to destroy yet; don't pass NULL to __destroy() */
		goto err_counter;
	}

	skel->rodata->num_cpu = evsel__nr_cpus(evsel);

	/* one perf_event per online cpu; single accumulator slot */
	bpf_map__resize(skel->maps.events, evsel__nr_cpus(evsel));
	bpf_map__resize(skel->maps.fentry_readings, 1);
	bpf_map__resize(skel->maps.accum_readings, 1);

	prog_name = bpf_target_prog_name(prog_fd);
	if (!prog_name) {
		pr_err("Failed to get program name for bpf prog %u. Does it have BTF?\n", prog_id);
		goto err_out;
	}

	bpf_object__for_each_program(prog, skel->obj) {
		err = bpf_program__set_attach_target(prog, prog_fd, prog_name);
		if (err) {
			pr_err("bpf_program__set_attach_target failed.\n"
			       "Does bpf prog %u have BTF?\n", prog_id);
			goto err_out;
		}
	}
	set_max_rlimit();
	err = bpf_prog_profiler_bpf__load(skel);
	if (err) {
		pr_err("bpf_prog_profiler_bpf__load failed\n");
		goto err_out;
	}

	counter->skel = skel;
	list_add(&counter->list, &evsel->bpf_counter_list);
	free(prog_name);	/* strdup()'d by bpf_target_prog_name(); was leaked */
	close(prog_fd);
	return 0;
err_out:
	bpf_prog_profiler_bpf__destroy(skel);
err_counter:
	free(prog_name);	/* free(NULL) is a no-op */
	free(counter);
	close(prog_fd);
	return -1;
}

/*
 * Parse target->bpf_str as a comma-separated list of BPF program ids and
 * load one profiler skeleton per id onto this evsel.
 *
 * Returns 0 on success; -1 on parse error, allocation failure, or load
 * failure (any counters loaded so far are destroyed).
 */
static int bpf_program_profiler__load(struct evsel *evsel, struct target *target)
{
	char *bpf_str, *bpf_str_, *tok, *saveptr = NULL, *p;
	u32 prog_id;
	int ret;

	/* strtok_r() mutates its input, so work on a copy */
	bpf_str_ = bpf_str = strdup(target->bpf_str);
	if (!bpf_str)
		return -1;

	while ((tok = strtok_r(bpf_str, ",", &saveptr)) != NULL) {
		prog_id = strtoul(tok, &p, 10);
		if (prog_id == 0 || prog_id == UINT_MAX ||
		    (*p != '\0' && *p != ',')) {
			pr_err("Failed to parse bpf prog ids %s\n",
			       target->bpf_str);
			free(bpf_str_);	/* was leaked on this path */
			return -1;
		}

		ret = bpf_program_profiler_load_one(evsel, prog_id);
		if (ret) {
			bpf_program_profiler__destroy(evsel);
			free(bpf_str_);
			return -1;
		}
		bpf_str = NULL;	/* subsequent strtok_r() calls continue the scan */
	}
	free(bpf_str_);
	return 0;
}

/*
 * Attach the monitor programs of every counter on this evsel.  On the first
 * attach failure all counters are destroyed and the error is returned.
 */
static int bpf_program_profiler__enable(struct evsel *evsel)
{
	struct bpf_counter *counter;
	int err;

	list_for_each_entry(counter, &evsel->bpf_counter_list, list) {
		assert(counter->skel != NULL);

		err = bpf_prog_profiler_bpf__attach(counter->skel);
		if (!err)
			continue;

		bpf_program_profiler__destroy(evsel);
		return err;
	}
	return 0;
}

/*
 * Sum the accumulated per-cpu readings of every monitor prog attached to
 * this evsel into evsel->counts.  Returns 0 on success, -EAGAIN when no
 * BPF counters are attached, or the bpf_map_lookup_elem() error.
 */
static int bpf_program_profiler__read(struct evsel *evsel)
{
	// perf_cpu_map uses /sys/devices/system/cpu/online
	int num_cpu = evsel__nr_cpus(evsel);
	// BPF_MAP_TYPE_PERCPU_ARRAY uses /sys/devices/system/cpu/possible
	// Sometimes possible > online, like on a Ryzen 3900X that has 24
	// threads but its possible showed 0-31 -acme
	int num_cpu_bpf = libbpf_num_possible_cpus();
	/*
	 * The lookup buffer must be sized by 'possible' cpus: the kernel
	 * writes one bpf_perf_event_value per possible cpu, and a smaller
	 * buffer would be overrun (hard-to-debug memory corruption).
	 */
	struct bpf_perf_event_value values[num_cpu_bpf];
	struct bpf_counter *counter;
	int reading_map_fd;
	__u32 key = 0;
	int err, cpu;

	if (list_empty(&evsel->bpf_counter_list))
		return -EAGAIN;

	/* Zero the totals; each counter's readings are accumulated below. */
	for (cpu = 0; cpu < num_cpu; cpu++) {
		perf_counts(evsel->counts, cpu, 0)->val = 0;
		perf_counts(evsel->counts, cpu, 0)->ena = 0;
		perf_counts(evsel->counts, cpu, 0)->run = 0;
	}
	list_for_each_entry(counter, &evsel->bpf_counter_list, list) {
		struct bpf_prog_profiler_bpf *skel = counter->skel;

		assert(skel != NULL);
		/* accum_readings is a single-slot PERCPU_ARRAY (key 0) */
		reading_map_fd = bpf_map__fd(skel->maps.accum_readings);

		err = bpf_map_lookup_elem(reading_map_fd, &key, values);
		if (err) {
			pr_err("failed to read value\n");
			return err;
		}

		/*
		 * perf_counts only holds 'online' (num_cpu) entries, so keep
		 * indexing it with num_cpu, not num_cpu_bpf, to avoid writing
		 * past the end of that array.
		 */
		for (cpu = 0; cpu < num_cpu; cpu++) {
			perf_counts(evsel->counts, cpu, 0)->val += values[cpu].counter;
			perf_counts(evsel->counts, cpu, 0)->ena += values[cpu].enabled;
			perf_counts(evsel->counts, cpu, 0)->run += values[cpu].running;
		}
	}
	return 0;
}

/*
 * Hand the perf_event fd for 'cpu' to each monitor prog by storing it in
 * the slot of the skeleton's events map for that cpu.  Returns 0 on
 * success or the first bpf_map_update_elem() error.
 */
static int bpf_program_profiler__install_pe(struct evsel *evsel, int cpu,
					    int fd)
{
	struct bpf_counter *counter;
	int err;

	list_for_each_entry(counter, &evsel->bpf_counter_list, list) {
		struct bpf_prog_profiler_bpf *skel = counter->skel;

		assert(skel != NULL);

		err = bpf_map_update_elem(bpf_map__fd(skel->maps.events),
					  &cpu, &fd, BPF_ANY);
		if (err)
			return err;
	}
	return 0;
}

/* bpf_counter interface implementation backed by the BPF program profiler. */
struct bpf_counter_ops bpf_program_profiler_ops = {
	.load       = bpf_program_profiler__load,
	.enable	    = bpf_program_profiler__enable,
	.read       = bpf_program_profiler__read,
	.destroy    = bpf_program_profiler__destroy,
	.install_pe = bpf_program_profiler__install_pe,
};

/* Install a perf_event fd for 'cpu'; a no-op when the evsel has no BPF counters. */
int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd)
{
	return list_empty(&evsel->bpf_counter_list) ? 0 :
	       evsel->bpf_counter_ops->install_pe(evsel, cpu, fd);
}

/*
 * Select the BPF counter backend for this evsel (currently only the program
 * profiler, when the target names BPF prog ids) and run its load hook.
 * Returns 0 when no backend applies.
 */
int bpf_counter__load(struct evsel *evsel, struct target *target)
{
	if (target__has_bpf(target))
		evsel->bpf_counter_ops = &bpf_program_profiler_ops;

	if (!evsel->bpf_counter_ops)
		return 0;

	return evsel->bpf_counter_ops->load(evsel, target);
}

/* Enable BPF counters on this evsel; a no-op (success) when none are attached. */
int bpf_counter__enable(struct evsel *evsel)
{
	if (!list_empty(&evsel->bpf_counter_list))
		return evsel->bpf_counter_ops->enable(evsel);

	return 0;
}

/* Read BPF counters into evsel->counts; -EAGAIN when none are attached. */
int bpf_counter__read(struct evsel *evsel)
{
	return list_empty(&evsel->bpf_counter_list) ? -EAGAIN :
	       evsel->bpf_counter_ops->read(evsel);
}

/* Destroy any BPF counters on this evsel and drop the backend ops pointer. */
void bpf_counter__destroy(struct evsel *evsel)
{
	if (!list_empty(&evsel->bpf_counter_list)) {
		evsel->bpf_counter_ops->destroy(evsel);
		evsel->bpf_counter_ops = NULL;
	}
}
Loading