Commit 10742d0c authored by Namhyung Kim's avatar Namhyung Kim Committed by Arnaldo Carvalho de Melo
Browse files

perf record: Implement basic filtering for off-cpu



It should honor cpu and task filtering with -a, -C or -p, -t options.

Committer testing:

  # perf record --off-cpu --cpu 1 perf bench sched messaging -l 1000
  # Running 'sched/messaging' benchmark:
  # 20 sender and receiver processes per group
  # 10 groups == 400 processes run

       Total time: 1.722 [sec]
  [ perf record: Woken up 2 times to write data ]
  [ perf record: Captured and wrote 1.446 MB perf.data (7248 samples) ]
  #
  # perf script | head -20
              perf 97164 [001] 38287.696761:          1      cycles:  ffffffffb6070174 native_write_msr+0x4 (vmlinux)
              perf 97164 [001] 38287.696764:          1      cycles:  ffffffffb6070174 native_write_msr+0x4 (vmlinux)
              perf 97164 [001] 38287.696765:          9      cycles:  ffffffffb6070174 native_write_msr+0x4 (vmlinux)
              perf 97164 [001] 38287.696767:        212      cycles:  ffffffffb6070176 native_write_msr+0x6 (vmlinux)
              perf 97164 [001] 38287.696768:       5130      cycles:  ffffffffb6070176 native_write_msr+0x6 (vmlinux)
              perf 97164 [001] 38287.696770:     123063      cycles:  ffffffffb6e0011e syscall_return_via_sysret+0x38 (vmlinux)
              perf 97164 [001] 38287.696803:    2292748      cycles:  ffffffffb636c82d __fput+0xad (vmlinux)
           swapper     0 [001] 38287.702852:    1927474      cycles:  ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
            :97513 97513 [001] 38287.767207:    1172536      cycles:  ffffffffb612ff65 newidle_balance+0x5 (vmlinux)
           swapper     0 [001] 38287.769567:    1073081      cycles:  ffffffffb618216d ktime_get_mono_fast_ns+0xd (vmlinux)
            :97533 97533 [001] 38287.770962:     984460      cycles:  ffffffffb65b2900 selinux_socket_sendmsg+0x0 (vmlinux)
            :97540 97540 [001] 38287.772242:     883462      cycles:  ffffffffb6d0bf59 irqentry_exit_to_user_mode+0x9 (vmlinux)
           swapper     0 [001] 38287.773633:     741963      cycles:  ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
            :97552 97552 [001] 38287.774539:     606680      cycles:  ffffffffb62eda0a page_add_file_rmap+0x7a (vmlinux)
            :97556 97556 [001] 38287.775333:     502254      cycles:  ffffffffb634f964 get_obj_cgroup_from_current+0xc4 (vmlinux)
            :97561 97561 [001] 38287.776163:     427891      cycles:  ffffffffb61b1522 cgroup_rstat_updated+0x22 (vmlinux)
           swapper     0 [001] 38287.776854:     359030      cycles:  ffffffffb612fc5e load_balance+0x9ce (vmlinux)
            :97567 97567 [001] 38287.777312:     330371      cycles:  ffffffffb6a8d8d0 skb_set_owner_w+0x0 (vmlinux)
            :97566 97566 [001] 38287.777589:     311622      cycles:  ffffffffb614a7a8 native_queued_spin_lock_slowpath+0x148 (vmlinux)
            :97512 97512 [001] 38287.777671:     307851      cycles:  ffffffffb62e0f35 find_vma+0x55 (vmlinux)
  #
  # perf record --off-cpu --cpu 4 perf bench sched messaging -l 1000
  # Running 'sched/messaging' benchmark:
  # 20 sender and receiver processes per group
  # 10 groups == 400 processes run

       Total time: 1.613 [sec]
  [ perf record: Woken up 2 times to write data ]
  [ perf record: Captured and wrote 1.415 MB perf.data (6729 samples) ]
  # perf script | head -20
              perf 97650 [004] 38323.728036:          1      cycles:  ffffffffb6070174 native_write_msr+0x4 (vmlinux)
              perf 97650 [004] 38323.728040:          1      cycles:  ffffffffb6070174 native_write_msr+0x4 (vmlinux)
              perf 97650 [004] 38323.728041:          9      cycles:  ffffffffb6070174 native_write_msr+0x4 (vmlinux)
              perf 97650 [004] 38323.728042:        208      cycles:  ffffffffb6070176 native_write_msr+0x6 (vmlinux)
              perf 97650 [004] 38323.728044:       5026      cycles:  ffffffffb6070176 native_write_msr+0x6 (vmlinux)
              perf 97650 [004] 38323.728046:     119970      cycles:  ffffffffb6d0bebc syscall_exit_to_user_mode+0x1c (vmlinux)
              perf 97650 [004] 38323.728078:    2190103      cycles:            54b756 perf_tool__process_synth_event+0x16 (/home/acme/bin/perf)
           swapper     0 [004] 38323.783357:    1593139      cycles:  ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
           swapper     0 [004] 38323.785352:    1593139      cycles:  ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
           swapper     0 [004] 38323.797330:    1418936      cycles:  ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
           swapper     0 [004] 38323.802350:    1418936      cycles:  ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
           swapper     0 [004] 38323.806333:    1418936      cycles:  ffffffffb6761378 mwait_idle_with_hints.constprop.0+0x48 (vmlinux)
            :97996 97996 [004] 38323.807145:    1418936      cycles:      7f5db9be6917 [unknown] ([unknown])
            :97959 97959 [004] 38323.807730:    1445074      cycles:  ffffffffb6329d36 memcg_slab_post_alloc_hook+0x146 (vmlinux)
            :97959 97959 [004] 38323.808103:    1341584      cycles:  ffffffffb62fd90f get_page_from_freelist+0x112f (vmlinux)
            :97959 97959 [004] 38323.808451:    1227537      cycles:  ffffffffb65b2905 selinux_socket_sendmsg+0x5 (vmlinux)
            :97959 97959 [004] 38323.808768:    1184321      cycles:  ffffffffb6d1ba35 _raw_spin_lock_irqsave+0x15 (vmlinux)
            :97959 97959 [004] 38323.809073:    1153017      cycles:  ffffffffb6a8d92d skb_set_owner_w+0x5d (vmlinux)
            :97959 97959 [004] 38323.809402:    1126875      cycles:  ffffffffb6329c64 memcg_slab_post_alloc_hook+0x74 (vmlinux)
            :97959 97959 [004] 38323.809695:    1073248      cycles:  ffffffffb6e0001d entry_SYSCALL_64+0x1d (vmlinux)
  #

Signed-off-by: default avatarNamhyung Kim <namhyung@kernel.org>
Tested-by: default avatarArnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Hao Luo <haoluo@google.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: bpf@vger.kernel.org
Link: https://lore.kernel.org/r/20220518224725.742882-4-namhyung@kernel.org


Signed-off-by: default avatarArnaldo Carvalho de Melo <acme@redhat.com>
parent edc41a10
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -892,7 +892,7 @@ static int record__config_text_poke(struct evlist *evlist)

static int record__config_off_cpu(struct record *rec)
{
	return off_cpu_prepare(rec->evlist);
	return off_cpu_prepare(rec->evlist, &rec->opts.target);
}

static bool record__kcore_readable(struct machine *machine)
+70 −8
Original line number Diff line number Diff line
@@ -6,6 +6,9 @@
#include "util/off_cpu.h"
#include "util/perf-hooks.h"
#include "util/session.h"
#include "util/target.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include <bpf/bpf.h>

#include "bpf_skel/off_cpu.skel.h"
@@ -60,8 +63,23 @@ static int off_cpu_config(struct evlist *evlist)
	return 0;
}

static void off_cpu_start(void *arg __maybe_unused)
static void off_cpu_start(void *arg)
{
	struct evlist *evlist = arg;

	/* update task filter for the given workload */
	if (!skel->bss->has_cpu && !skel->bss->has_task &&
	    perf_thread_map__pid(evlist->core.threads, 0) != -1) {
		int fd;
		u32 pid;
		u8 val = 1;

		skel->bss->has_task = 1;
		fd = bpf_map__fd(skel->maps.task_filter);
		pid = perf_thread_map__pid(evlist->core.threads, 0);
		bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
	}

	skel->bss->enabled = 1;
}

@@ -71,31 +89,75 @@ static void off_cpu_finish(void *arg __maybe_unused)
	off_cpu_bpf__destroy(skel);
}

int off_cpu_prepare(struct evlist *evlist)
int off_cpu_prepare(struct evlist *evlist, struct target *target)
{
	int err;
	int err, fd, i;
	int ncpus = 1, ntasks = 1;

	if (off_cpu_config(evlist) < 0) {
		pr_err("Failed to config off-cpu BPF event\n");
		return -1;
	}

	set_max_rlimit();

	skel = off_cpu_bpf__open_and_load();
	skel = off_cpu_bpf__open();
	if (!skel) {
		pr_err("Failed to open off-cpu BPF skeleton\n");
		return -1;
	}

	/* don't need to set cpu filter for system-wide mode */
	if (target->cpu_list) {
		ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus);
		bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
	}

	if (target__has_task(target)) {
		ntasks = perf_thread_map__nr(evlist->core.threads);
		bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
	}

	set_max_rlimit();

	err = off_cpu_bpf__load(skel);
	if (err) {
		pr_err("Failed to load off-cpu skeleton\n");
		goto out;
	}

	if (target->cpu_list) {
		u32 cpu;
		u8 val = 1;

		skel->bss->has_cpu = 1;
		fd = bpf_map__fd(skel->maps.cpu_filter);

		for (i = 0; i < ncpus; i++) {
			cpu = perf_cpu_map__cpu(evlist->core.user_requested_cpus, i).cpu;
			bpf_map_update_elem(fd, &cpu, &val, BPF_ANY);
		}
	}

	if (target__has_task(target)) {
		u32 pid;
		u8 val = 1;

		skel->bss->has_task = 1;
		fd = bpf_map__fd(skel->maps.task_filter);

		for (i = 0; i < ntasks; i++) {
			pid = perf_thread_map__pid(evlist->core.threads, i);
			bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
		}
	}

	err = off_cpu_bpf__attach(skel);
	if (err) {
		pr_err("Failed to attach off-cpu BPF skeleton\n");
		goto out;
	}

	if (perf_hooks__set_hook("record_start", off_cpu_start, NULL) ||
	    perf_hooks__set_hook("record_end", off_cpu_finish, NULL)) {
	if (perf_hooks__set_hook("record_start", off_cpu_start, evlist) ||
	    perf_hooks__set_hook("record_end", off_cpu_finish, evlist)) {
		pr_err("Failed to attach off-cpu skeleton\n");
		goto out;
	}
+48 −4
Original line number Diff line number Diff line
@@ -49,12 +49,28 @@ struct {
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

/* old kernel task_struct definition */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));

int enabled = 0;
int has_cpu = 0;
int has_task = 0;

/*
 * Old kernel used to call it task_struct->state and now it's '__state'.
@@ -74,6 +90,37 @@ static inline int get_task_state(struct task_struct *t)
	return BPF_CORE_READ(t_old, state);
}

static inline int can_record(struct task_struct *t, int state)
{
	/* kernel threads don't have user stack */
	if (t->flags & PF_KTHREAD)
		return 0;

	if (state != TASK_INTERRUPTIBLE &&
	    state != TASK_UNINTERRUPTIBLE)
		return 0;

	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid = t->pid;

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	return 1;
}

SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
@@ -92,10 +139,7 @@ int on_switch(u64 *ctx)

	ts = bpf_ktime_get_ns();

	if (prev->flags & PF_KTHREAD)
		goto next;
	if (state != TASK_INTERRUPTIBLE &&
	    state != TASK_UNINTERRUPTIBLE)
	if (!can_record(prev, state))
		goto next;

	stack_id = bpf_get_stackid(ctx, &stacks,
+4 −2
Original line number Diff line number Diff line
@@ -2,15 +2,17 @@
#define PERF_UTIL_OFF_CPU_H

struct evlist;
struct target;
struct perf_session;

#define OFFCPU_EVENT  "offcpu-time"

#ifdef HAVE_BPF_SKEL
int off_cpu_prepare(struct evlist *evlist);
int off_cpu_prepare(struct evlist *evlist, struct target *target);
int off_cpu_write(struct perf_session *session);
#else
static inline int off_cpu_prepare(struct evlist *evlist __maybe_unused)
static inline int off_cpu_prepare(struct evlist *evlist __maybe_unused,
				  struct target *target __maybe_unused)
{
	return -1;
}