Commit 6c6848e7 authored by Anshuman Khandual's avatar Anshuman Khandual Committed by Junhao He
Browse files

drivers: perf: arm_pmuv3: Enable branch stack sampling via FEAT_BRBE

maillist inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8EC9K
CVE: NA

Reference: https://lore.kernel.org/linux-arm-kernel/20240613061731.3109448-1-anshuman.khandual@arm.com/



--------------------------------

This extends recently added branch stack sampling framework in ARMV8 PMU to
enable such events via new architecture feature called Branch Record Buffer
Extension aka BRBE. This implements all the armv8pmu_branch_xxx() callbacks
as expected at ARMV8 PMU level required to drive perf branch stack sampling
events. This adds a new config option CONFIG_ARM64_BRBE to encapsulate this
BRBE based implementation, available only on ARM64 platforms.

BRBE hardware captures a branch record via three distinct system registers
representing branch source address, branch target address, and other branch
information. A BRBE buffer implementation is organized as multiple banks of
32 branch records each, which is a collection of BRBSRC_EL1, BRBTGT_EL1 and
BRBINF_EL1 registers. Though total BRBE record entries i.e BRBE_MAX_ENTRIES
cannot exceed MAX_BRANCH_RECORDS as defined for ARM PMU.

Branch stack sampling is enabled and disabled along with regular PMU
events. This adds the required function callbacks in armv8pmu_branch_xxx()
format, to drive the PMU branch stack hardware when supported. This also
adds fallback stub definitions for these callbacks for PMUs which lack the
required support.

BRBE hardware attributes get captured in a new reg_brbidr element in struct
arm_pmu during armv8pmu_branch_probe() which is called from broader probing
function __armv8pmu_probe_pmu(). Attributes such as number of branch record
entries implemented in the hardware can be derived from armpmu->reg_brbidr.

BRBE gets enabled via armv8pmu_branch_enable() where it also derives branch
filter, and additional requirements from event's 'attr.branch_sample_type'
and configures them via BRBFCR_EL1 and BRBCR_EL1 registers.

PMU event overflow triggers IRQ, where current branch records get captured,
stitched along with older records available in 'task_ctx', before getting
processed for core perf ring buffer. Task context switch outs incrementally
save current branch records in event's 'pmu_ctx->task_ctx_data' to optimize
workload's branch record samples.

In case multiple events with different branch sample type requests converge
on the same PMU, BRBE gets enabled for the merged branch filter accommodating
all those events' branch sample types. Captured branch records get filtered
in software for an overflown event if the BRBE hardware config does not match
its branch sample type, while handling the PMU IRQ.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: default avatarAnshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: default avatarJunhao He <hejunhao3@huawei.com>
parent 9192278e
Loading
Loading
Loading
Loading
+12 −0
Original line number Diff line number Diff line
@@ -180,6 +180,18 @@ config ARM_SPE_PMU
	  Extension, which provides periodic sampling of operations in
	  the CPU pipeline and reports this via the perf AUX interface.

config ARM64_BRBE
	bool "Enable support for branch stack sampling using FEAT_BRBE"
	depends on PERF_EVENTS && ARM64 && ARM_PMU
	depends on !FUNCTION_ALIGNMENT_64B
	default y
	help
	  Enable perf support for Branch Record Buffer Extension (BRBE) which
	  records all branches taken in an execution path. This supports some
	  branch types and privilege based filtering. It captures additional
	  relevant information such as cycle count, misprediction and branch
	  type, branch privilege level etc.

config ARM_DMC620_PMU
	tristate "Enable PMU support for the ARM DMC-620 memory controller"
	depends on (ARM64 && ACPI) || COMPILE_TEST
+1 −0
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@ obj-$(CONFIG_RISCV_PMU_SBI) += riscv_pmu_sbi.o
obj-$(CONFIG_THUNDERX2_PMU) += thunderx2_pmu.o
obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o
obj-$(CONFIG_ARM64_BRBE) += arm_brbe.o
obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
obj-$(CONFIG_MARVELL_CN10K_DDR_PMU) += marvell_cn10k_ddr_pmu.o
+1198 −0

File added.

Preview size limit exceeded, changes collapsed.

+159 −1
Original line number Diff line number Diff line
@@ -26,6 +26,7 @@
#include <linux/nmi.h>

#include <asm/arm_pmuv3.h>
#include "arm_pmuv3_branch.h"

/* ARMv8 Cortex-A53 specific event types. */
#define ARMV8_A53_PERFCTR_PREF_LINEFILL				0xC2
@@ -834,14 +835,70 @@ static void armv8pmu_start(struct arm_pmu *cpu_pmu)
	armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);

	kvm_vcpu_pmu_resync_el0();
	if (cpu_pmu->has_branch_stack)
		armv8pmu_branch_enable(cpu_pmu);
}

static void armv8pmu_stop(struct arm_pmu *cpu_pmu)
{
	/*
	 * Quiesce BRBE first, before the counters are stopped, so that no
	 * further branch records get captured for a PMU that is going idle.
	 */
	if (cpu_pmu->has_branch_stack)
		armv8pmu_branch_disable();

	/* Disable all counters */
	armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E);
}

/*
 * Capture branch records for an overflown event and save them into its
 * perf sample data. Called from the PMU IRQ handler for each overflown
 * event that requested branch stack sampling.
 *
 * @cpuc:	this CPU's PMU events state, holding the shared branch
 *		records buffer and the merged branch_sample_type
 * @event:	the overflown perf event being processed
 * @data:	perf sample data the branch stack gets saved into
 * @branch_captured: IRQ-scope flag, set once the hardware records have
 *		been read so subsequent overflown events can reuse them
 */
static void read_branch_records(struct pmu_hw_events *cpuc,
				struct perf_event *event,
				struct perf_sample_data *data,
				bool *branch_captured)
{
	/* Filled in by arm64_filter_branch_records() when filtering applies */
	struct branch_records event_records;

	/*
	 * CPU specific branch records buffer must have been allocated already
	 * for the hardware records to be captured and processed further.
	 */
	if (WARN_ON(!cpuc->branches))
		return;

	/*
	 * When the current task context does not match with the PMU overflown
	 * event, the captured branch records here cannot be co-related to the
	 * overflowed event. Report to the user - as if no branch records have
	 * been captured, and flush branch records.
	 */
	if (event->ctx->task && (cpuc->branch_context != event->ctx))
		return;

	/*
	 * Read the branch records from the hardware once after the PMU IRQ
	 * has been triggered but subsequently same records can be used for
	 * other events that might have been overflowed simultaneously thus
	 * saving much CPU cycles.
	 */
	if (!*branch_captured) {
		armv8pmu_branch_read(cpuc, event);
		*branch_captured = true;
	}

	/*
	 * Filter captured branch records
	 *
	 * PMU captured branch records would contain samples applicable for
	 * the aggregated branch filters, for all events that got scheduled
	 * on this PMU simultaneously. Hence these branch records need to
	 * be filtered first so that each individual event get samples they
	 * had requested originally.
	 */
	if (cpuc->branch_sample_type != event->attr.branch_sample_type) {
		arm64_filter_branch_records(cpuc, event, &event_records);
		perf_sample_save_brstack(data, event, &event_records.branch_stack, NULL);
		return;
	}
	perf_sample_save_brstack(data, event, &cpuc->branches->branch_stack, NULL);
}

static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
{
	u32 pmovsr;
@@ -849,6 +906,7 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
	struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
	struct pt_regs *regs;
	int idx;
	bool branch_captured = false;

	/*
	 * Get and reset the IRQ flags
@@ -892,6 +950,13 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
		if (!armpmu_event_set_period(event))
			continue;

		/*
		 * PMU IRQ should remain asserted until all branch records
		 * are captured and processed into struct perf_sample_data.
		 */
		if (has_branch_stack(event) && cpu_pmu->has_branch_stack)
			read_branch_records(cpuc, event, &data, &branch_captured);

		/*
		 * Perf event overflow will queue the processing of the event as
		 * an irq_work which will be taken care of in the handling of
@@ -901,6 +966,8 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
			cpu_pmu->disable(event);
	}
	armv8pmu_start(cpu_pmu);
	if (cpu_pmu->has_branch_stack)
		armv8pmu_branch_stack_reset();

	return IRQ_HANDLED;
}
@@ -991,6 +1058,40 @@ static int armv8pmu_user_event_idx(struct perf_event *event)
	return event->hw.idx;
}

static bool armv8pmu_branch_stack_init(struct perf_event *event)
{
	if (!armv8pmu_branch_attr_valid(event))
		return false;

	/*
	 * When a task is scheduled out, the current branch records are
	 * stashed in the task's context data, so they can seed the sample
	 * on a later event overflow. Enable PERF_ATTACH_TASK_DATA in
	 * 'event->attach_state' for every branch stack sampling event.
	 */
	event->attach_state |= PERF_ATTACH_TASK_DATA;
	return true;
}

static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
	void *task_ctx = pmu_ctx->task_ctx_data;

	if (!armpmu->has_branch_stack)
		return;

	/* Reset branch records on sched in */
	if (sched_in) {
		armv8pmu_branch_stack_reset();
		return;
	}

	/* Save branch records in task_ctx on sched out */
	if (task_ctx)
		armv8pmu_branch_save(armpmu, task_ctx);
}

/*
 * Add an event filter to a given event.
 */
@@ -1083,6 +1184,9 @@ static void armv8pmu_reset(void *info)
		pmcr |= ARMV8_PMU_PMCR_LP;

	armv8pmu_pmcr_write(pmcr);

	if (cpu_pmu->has_branch_stack)
		armv8pmu_branch_stack_reset();
}

static int __armv8_pmuv3_map_event_id(struct arm_pmu *armpmu,
@@ -1235,6 +1339,41 @@ static void __armv8pmu_probe_pmu(void *info)
		cpu_pmu->reg_pmmir = read_pmmir();
	else
		cpu_pmu->reg_pmmir = 0;

	/*
	 * BRBE is being probed on a single cpu for a
	 * given PMU. The remaining cpus, are assumed
	 * to have the exact same BRBE implementation.
	 */
	armv8pmu_branch_probe(cpu_pmu);
}

static int branch_records_alloc(struct arm_pmu *armpmu)
{
	struct branch_records __percpu *records;
	int cpu;

	records = alloc_percpu_gfp(struct branch_records, GFP_KERNEL);
	if (!records)
		return -ENOMEM;

	/*
	 * Wire each CPU's hw_events up to its slice of the percpu buffer.
	 *
	 * The percpu allocation is consumed here in full and is never
	 * released later, so dropping the 'records' anchor on return is
	 * acceptable - otherwise the handle would have to be kept around
	 * for an eventual free_percpu().
	 */
	for_each_possible_cpu(cpu) {
		struct pmu_hw_events *hw_events = per_cpu_ptr(armpmu->hw_events, cpu);

		hw_events->branches = per_cpu_ptr(records, cpu);
	}
	return 0;
}

static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
@@ -1251,7 +1390,21 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
	if (ret)
		return ret;

	return probe.present ? 0 : -ENODEV;
	if (!probe.present)
		return -ENODEV;

	if (cpu_pmu->has_branch_stack) {
		ret = armv8pmu_task_ctx_cache_alloc(cpu_pmu);
		if (ret)
			return ret;

		ret = branch_records_alloc(cpu_pmu);
		if (ret) {
			armv8pmu_task_ctx_cache_free(cpu_pmu);
			return ret;
		}
	}
	return 0;
}

static void armv8pmu_disable_user_access_ipi(void *unused)
@@ -1311,6 +1464,11 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
	cpu_pmu->set_event_filter	= armv8pmu_set_event_filter;

	cpu_pmu->pmu.event_idx		= armv8pmu_user_event_idx;
	cpu_pmu->sched_task		= armv8pmu_sched_task;
	cpu_pmu->branch_stack_init	= armv8pmu_branch_stack_init;
	cpu_pmu->branch_stack_add	= armv8pmu_branch_stack_add;
	cpu_pmu->branch_stack_del	= armv8pmu_branch_stack_del;
	cpu_pmu->branch_stack_reset	= armv8pmu_branch_stack_reset;

	cpu_pmu->name			= name;
	cpu_pmu->map_event		= map_event;
+83 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Branch Record Buffer Extension Helpers.
 *
 * Copyright (C) 2022-2023 ARM Limited
 *
 * Author: Anshuman Khandual <anshuman.khandual@arm.com>
 */
#include <linux/perf/arm_pmu.h>

#ifdef CONFIG_ARM64_BRBE
/*
 * BRBE driver callbacks invoked from the ARMv8 PMU driver; the real
 * implementations live in arm_brbe.c when CONFIG_ARM64_BRBE is enabled.
 */
void armv8pmu_branch_stack_add(struct perf_event *event, struct pmu_hw_events *cpuc);
void armv8pmu_branch_stack_del(struct perf_event *event, struct pmu_hw_events *cpuc);
void armv8pmu_branch_stack_reset(void);
void armv8pmu_branch_probe(struct arm_pmu *arm_pmu);
bool armv8pmu_branch_attr_valid(struct perf_event *event);
void armv8pmu_branch_enable(struct arm_pmu *arm_pmu);
void armv8pmu_branch_disable(void);
void armv8pmu_branch_read(struct pmu_hw_events *cpuc,
			  struct perf_event *event);
void arm64_filter_branch_records(struct pmu_hw_events *cpuc,
				 struct perf_event *event,
				 struct branch_records *event_records);
void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx);
int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu);
void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu);
#else
/*
 * Fallback stubs for kernels built without CONFIG_ARM64_BRBE. They keep
 * the ARMv8 PMU driver free of #ifdefs at the call sites; paths that
 * should only be reached for branch stack sampling events warn once.
 */
static inline void armv8pmu_branch_stack_add(struct perf_event *event, struct pmu_hw_events *cpuc)
{
}

static inline void armv8pmu_branch_stack_del(struct perf_event *event, struct pmu_hw_events *cpuc)
{
}

static inline void armv8pmu_branch_stack_reset(void)
{
}

static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu)
{
}

/* Without BRBE, no branch stack attribute can ever be valid. */
static inline bool armv8pmu_branch_attr_valid(struct perf_event *event)
{
	WARN_ON_ONCE(!has_branch_stack(event));
	return false;
}

static inline void armv8pmu_branch_enable(struct arm_pmu *arm_pmu)
{
}

static inline void armv8pmu_branch_disable(void)
{
}

/* Should be unreachable without BRBE - warn if a branch event gets here. */
static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc,
					struct perf_event *event)
{
	WARN_ON_ONCE(!has_branch_stack(event));
}

static inline void arm64_filter_branch_records(struct pmu_hw_events *cpuc,
					       struct perf_event *event,
					       struct branch_records *event_records)
{

}

static inline void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)
{
}

/* Nothing to allocate without BRBE; report success so probing proceeds. */
static inline int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu)
{
	return 0;
}

static inline void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu)
{
}
#endif
Loading