Commit 49a0ee0e authored by Anshuman Khandual, committed by Junhao He
Browse files

drivers: perf: arm_pmuv3: Enable branch stack sampling framework

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8EC9K
CVE: NA

Reference: https://lore.kernel.org/lkml/20230711082455.215983-6-anshuman.khandual@arm.com/



--------------------------------

Branch stack sampling support, i.e. capturing branch records during execution
in core perf, rides along with normal HW events being scheduled on the PMU.
This prepares the ARMV8 PMU framework for branch stack support on relevant
PMUs with the required HW implementation.

ARMV8 PMU hardware support for branch stack sampling is indicated via a new
feature flag called 'has_branch_stack' that can be ascertained via probing.
This modifies the current gate in armpmu_event_init(), which blocks branch
stack sampling based perf events unconditionally. Instead, it now allows such
perf events to be initialized on supporting PMU hardware.

Branch stack sampling is enabled and disabled along with regular PMU events,
so the relevant hardware also needs to be driven in tandem. This adds the
required function callbacks in armv8pmu_branch_xxx() format, to drive the PMU
branch stack hardware when supported. This also adds fallback stub definitions
for these callbacks for PMUs which would not have the required support.

Finally, this adds a new buffer, i.e. 'struct branch_records', which can hold
captured branch records during PMU IRQ processing before being passed on to
the perf ring buffer. These buffers are per cpu, and dynamically allocated
only for a supporting ARMV8 PMU. These buffers can hold 'MAX_BRANCH_RECORDS'
branch record entries.

This enables PERF_ATTACH_TASK_DATA for branch stack sampling perf events to
make them hold context branch records in their task_ctx_data. This will get
used to stash branch records that would have been lost when a given process
schedules out after a short run on the CPU without an event overflow.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: Junhao He <hejunhao3@huawei.com>
parent 2b1b3624
Loading
Loading
Loading
Loading
+43 −0
Original line number Diff line number Diff line
@@ -239,12 +239,55 @@
/* PMMIR_EL1.SLOTS mask */
#define ARMV8_PMU_SLOTS_MASK	0xff

struct pmu_hw_events;
struct arm_pmu;
struct perf_event;

#ifdef CONFIG_PERF_EVENTS
struct pt_regs;
extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
extern unsigned long perf_misc_flags(struct pt_regs *regs);
#define perf_misc_flags(regs)	perf_misc_flags(regs)
#define perf_arch_bpf_user_pt_regs(regs) &regs->user_regs

/* No-op fallback stub: there is no branch record state to reset here. */
static inline void armv8pmu_branch_reset(void)
{
}

/*
 * No-op fallback stub: @arm_pmu is left untouched, so this stub never
 * marks the PMU as having branch stack support.
 */
static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu)
{
}

/*
 * Fallback stub: @event is not inspected and its branch sampling
 * attributes are always reported as not valid.
 */
static inline bool armv8pmu_branch_attr_valid(struct perf_event *event)
{
	return false;
}

/* No-op fallback stub: nothing is enabled for @event. */
static inline void armv8pmu_branch_enable(struct perf_event *event)
{
}

/* No-op fallback stub: nothing is disabled for @event. */
static inline void armv8pmu_branch_disable(struct perf_event *event)
{
}

/* No-op fallback stub: neither @cpuc nor @event is read or modified. */
static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc,
					struct perf_event *event)
{
}

/* No-op fallback stub: nothing is written into @ctx. */
static inline void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)
{
}

/*
 * Fallback stub: allocates nothing for @arm_pmu and always returns 0,
 * so callers see success.
 */
static inline int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu)
{
	return 0;
}

/* No-op fallback stub: there is nothing to free for @arm_pmu. */
static inline void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu)
{
}
#endif

#define perf_arch_fetch_caller_regs(regs, __ip) { \
+89 −1
Original line number Diff line number Diff line
@@ -715,10 +715,16 @@ static void armv8pmu_enable_event(struct perf_event *event)
	 * Enable counter
	 */
	armv8pmu_enable_event_counter(event);

	if (has_branch_stack(event))
		armv8pmu_branch_enable(event);
}

static void armv8pmu_disable_event(struct perf_event *event)
{
	if (has_branch_stack(event))
		armv8pmu_branch_disable(event);

	/*
	 * Disable counter
	 */
@@ -792,6 +798,16 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
		if (!armpmu_event_set_period(event))
			continue;

		/*
		 * PMU IRQ should remain asserted until all branch records
		 * are captured and processed into struct perf_sample_data.
		 */
		if (has_branch_stack(event) && !WARN_ON(!cpuc->branches)) {
			armv8pmu_branch_read(cpuc, event);
			data.br_stack = &cpuc->branches->branch_stack;
			data.sample_flags |= PERF_SAMPLE_BRANCH_STACK;
		}

		/*
		 * Perf event overflow will queue the processing of the event as
		 * an irq_work which will be taken care of in the handling of
@@ -871,6 +887,24 @@ static void armv8pmu_clear_event_idx(struct pmu_hw_events *cpuc,
		clear_bit(idx - 1, cpuc->used_mask);
}

/*
 * Context switch hook for branch stack sampling.
 *
 * @ctx:      perf event context being switched, may be NULL
 * @sched_in: true when the context is scheduled in, false on sched out
 *
 * On a PMU with branch stack support, the branch records are reset on
 * sched in, and saved into the context's task_ctx_data (when present)
 * on sched out. Nothing is done for a PMU without branch stack support.
 */
static void armv8pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
	struct arm_pmu *armpmu;

	/*
	 * The owning PMU can only be found through ctx->pmu, so ctx must be
	 * validated before it is dereferenced. Without a context there is
	 * no PMU to act on.
	 */
	if (!ctx)
		return;

	armpmu = to_arm_pmu(ctx->pmu);
	if (!armpmu->has_branch_stack)
		return;

	/* Reset branch records on sched in */
	if (sched_in) {
		armv8pmu_branch_reset();
		return;
	}

	/* Save branch records in task_ctx_data on sched out */
	if (ctx->task_ctx_data)
		armv8pmu_branch_save(armpmu, ctx->task_ctx_data);
}

/*
 * Add an event filter to a given event.
 */
@@ -947,6 +981,9 @@ static void armv8pmu_reset(void *info)
		pmcr |= ARMV8_PMU_PMCR_LP;

	armv8pmu_pmcr_write(pmcr);

	if (cpu_pmu->has_branch_stack)
		armv8pmu_branch_reset();
}

static int __armv8_pmuv3_map_event(struct perf_event *event,
@@ -964,6 +1001,12 @@ static int __armv8_pmuv3_map_event(struct perf_event *event,
				       &armv8_pmuv3_perf_cache_map,
				       ARMV8_PMU_EVTYPE_EVENT);

	if (has_branch_stack(event)) {
		event->attach_state |= PERF_ATTACH_TASK_DATA;
		if (!armv8pmu_branch_attr_valid(event))
			return -EOPNOTSUPP;
	}

	if (armv8pmu_event_is_64bit(event))
		event->hw.flags |= ARMPMU_EVT_64BIT;

@@ -1056,6 +1099,35 @@ static void __armv8pmu_probe_pmu(void *info)
		cpu_pmu->reg_pmmir = read_cpuid(PMMIR_EL1);
	else
		cpu_pmu->reg_pmmir = 0;
	armv8pmu_branch_probe(cpu_pmu);
}

static int branch_records_alloc(struct arm_pmu *armpmu)
{
	struct branch_records __percpu *records;
	int cpu;

	records = alloc_percpu_gfp(struct branch_records, GFP_KERNEL);
	if (!records)
		return -ENOMEM;

	/*
	 * percpu memory allocated for 'records' gets completely consumed
	 * here, and never required to be freed up later. So permanently
	 * losing access to this anchor i.e 'records' is acceptable.
	 *
	 * Otherwise this allocation handle would have to be saved up for
	 * free_percpu() release later if required.
	 */
	for_each_possible_cpu(cpu) {
		struct pmu_hw_events *events_cpu;
		struct branch_records *records_cpu;

		events_cpu = per_cpu_ptr(armpmu->hw_events, cpu);
		records_cpu = per_cpu_ptr(records, cpu);
		events_cpu->branches = records_cpu;
	}
	return 0;
}

static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
@@ -1072,7 +1144,21 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
	if (ret)
		return ret;

	return probe.present ? 0 : -ENODEV;
	if (!probe.present)
		return -ENODEV;

	if (cpu_pmu->has_branch_stack) {
		ret = armv8pmu_task_ctx_cache_alloc(cpu_pmu);
		if (ret)
			return ret;

		ret = branch_records_alloc(cpu_pmu);
		if (ret) {
			armv8pmu_task_ctx_cache_free(cpu_pmu);
			return ret;
		}
	}
	return 0;
}

static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
@@ -1097,6 +1183,8 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
	cpu_pmu->reset			= armv8pmu_reset;
	cpu_pmu->set_event_filter	= armv8pmu_set_event_filter;
	cpu_pmu->filter_match		= armv8pmu_filter_match;
	cpu_pmu->sched_task		= armv8pmu_sched_task;
	cpu_pmu->branch_reset		= armv8pmu_branch_reset;

	cpu_pmu->name			= name;
	cpu_pmu->map_event		= map_event;
+17 −2
Original line number Diff line number Diff line
@@ -313,6 +313,11 @@ armpmu_del(struct perf_event *event, int flags)
	struct hw_perf_event *hwc = &event->hw;
	int idx = hwc->idx;

	WARN_ON_ONCE(!hw_events->brbe_users);
	hw_events->brbe_users--;
	if (!hw_events->brbe_users)
		hw_events->brbe_context = NULL;

	armpmu_stop(event, PERF_EF_UPDATE);
	hw_events->events[idx] = NULL;
	armpmu->clear_event_idx(hw_events, event);
@@ -329,6 +334,13 @@ armpmu_add(struct perf_event *event, int flags)
	struct hw_perf_event *hwc = &event->hw;
	int idx;

	if (event->ctx->task && hw_events->brbe_context != event->ctx) {
		hw_events->brbe_context = event->ctx;
		if (armpmu->branch_reset)
			armpmu->branch_reset();
	}
	hw_events->brbe_users++;

	/* An event following a process won't be stopped earlier */
	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
		return -ENOENT;
@@ -508,8 +520,11 @@ static int armpmu_event_init(struct perf_event *event)
		!cpumask_test_cpu(event->cpu, &armpmu->supported_cpus))
		return -ENOENT;

	/* does not support taken branch sampling */
	if (has_branch_stack(event))
	/*
	 * Branch stack sampling events are allowed
	 * only on PMU which has required support.
	 */
	if (has_branch_stack(event) && !armpmu->has_branch_stack)
		return -EOPNOTSUPP;

	if (armpmu->map_event(event) == -ENOENT)
+21 −1
Original line number Diff line number Diff line
@@ -41,6 +41,18 @@
	},								\
}

/*
 * Maximum branch record entries which could be processed
 * for core perf branch stack sampling support, regardless
 * of the hardware support available on a given ARM PMU.
 */
#define MAX_BRANCH_RECORDS 64

/*
 * Holding buffer for captured branch records: a perf_branch_stack header
 * plus storage for up to MAX_BRANCH_RECORDS branch entries.
 */
struct branch_records {
	/* Header handed to perf as the sample's br_stack in the PMU IRQ path */
	struct perf_branch_stack	branch_stack;
	/*
	 * Storage for the individual branch entries.
	 * NOTE(review): presumably this must stay directly after
	 * 'branch_stack' so that it backs that header's trailing entries
	 * array - confirm against struct perf_branch_stack before reordering.
	 */
	struct perf_branch_entry	branch_entries[MAX_BRANCH_RECORDS];
};

/* The events for a given PMU register set. */
struct pmu_hw_events {
	/*
@@ -67,6 +79,11 @@ struct pmu_hw_events {
	struct arm_pmu		*percpu_pmu;

	int irq;

	struct branch_records	*branches;
	void			*brbe_context;
	unsigned int		brbe_users;
	unsigned long		brbe_sample_type;
};

enum armpmu_attr_groups {
@@ -98,9 +115,12 @@ struct arm_pmu {
	void		(*reset)(void *);
	int		(*map_event)(struct perf_event *event);
	void		(*sched_task)(struct perf_event_context *ctx, bool sched_in);
	void		(*branch_reset)(void);
	int		(*filter_match)(struct perf_event *event);
	int		num_events;
	bool		secure_access; /* 32-bit ARM only */
	unsigned int	secure_access:1, /* 32-bit ARM only */
			has_branch_stack:1, /* 64-bit ARM only */
			reserved:30;
#define ARMV8_PMUV3_MAX_COMMON_EVENTS		0x40
	DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
#define ARMV8_PMUV3_EXT_COMMON_EVENT_BASE	0x4000