Commit d465bff1 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'perf-tools-for-v6.1-1-2022-10-07' of...

Merge tag 'perf-tools-for-v6.1-1-2022-10-07' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux

Pull perf tools updates from Arnaldo Carvalho de Melo:

 - Add support for AMD on 'perf mem' and 'perf c2c', the kernel
   enablement patches went via tip.

   Example:

      $ sudo perf mem record -- -c 10000
      ^C[ perf record: Woken up 227 times to write data ]
      [ perf record: Captured and wrote 58.760 MB perf.data (836978 samples) ]

      $ sudo perf mem report -F mem,sample,snoop
      Samples: 836K of event 'ibs_op//', Event count (approx.): 8418762
      Memory access                  Samples  Snoop
      N/A                             700620  N/A
      L1 hit                          126675  N/A
      L2 hit                             424  N/A
      L3 hit                             664  HitM
      L3 hit                              10  N/A
      Local RAM hit                        2  N/A
      Remote RAM (1 hop) hit            8558  N/A
      Remote Cache (1 hop) hit             3  N/A
      Remote Cache (1 hop) hit             2  HitM
      Remote Cache (2 hops) hit           10  HitM
      Remote Cache (2 hops) hit            6  N/A
      Uncached hit                         4  N/A
      $

 - "perf lock" improvements:

     - Add -E/--entries option to limit the number of entries to
       display, say to ask for just the top 5 contended locks.

     - Add -q/--quiet option to suppress header and debug messages.

     - Add a 'perf test' kernel lock contention entry to test 'perf
       lock'.

 - "perf lock contention" improvements:

     - Ask BPF's bpf_get_stackid() to skip some callchain entries.

       The ones closer to the tooling are bpf related and not that
       interesting, the ones calling the locking function are the ones
       we're interested in, example of a full, unskipped callstack:

     - Allow changing the callstack depth and number of entries to skip.

           1     10.74 us     10.74 us     10.74 us     spinlock   __bpf_trace_contention_begin+0xb
                          0xffffffffc03b5c47  bpf_prog_bf07ae9e2cbd02c5_contention_begin+0x117
                          0xffffffffc03b5c47  bpf_prog_bf07ae9e2cbd02c5_contention_begin+0x117
                          0xffffffffbb8b8e75  bpf_trace_run2+0x35
                          0xffffffffbb7eab9b  __bpf_trace_contention_begin+0xb
                          0xffffffffbb7ebe75  queued_spin_lock_slowpath+0x1f5
                          0xffffffffbc1c26ff  _raw_spin_lock+0x1f
                          0xffffffffbb841015  tick_do_update_jiffies64+0x25
                          0xffffffffbb8409ee  tick_irq_enter+0x9e

     - Show full callstack in verbose mode (-v option), sometimes this
       is desirable instead of showing just one callstack entry.

 - Allow multiple time ranges in 'perf record --delay' to help in
   reducing the amount of data collected from hardware tracing (Intel
   PT, etc) when there is a rough idea of periods of time where events
   of interest take place.

 - Add an Intel PT option to record decoder debug messages only when an
   error happens.

 - Improve layout of Intel PT man page.

 - Add new branch types: alignment, data and inst faults and arch
   specific ones, such as fiq, debug_halt, debug_exit, debug_inst and
   debug_data on arm64.

   Kernel enablement went thru the tip tree.

 - Fix 'perf probe' error log check in 'perf test' when no debuginfo is
   available.

 - Fix 'perf stat' aggregation mode logic, it should be looking at the
   CPU not at the core number.

 - Fix flags parsing in 'perf trace' filters.

 - Introduce compact encoding of CPU ranges in perf.data, to avoid
   having a bitmap with all the CPUs.

 - Improvements to the 'perf stat' metrics, including adding
   "core_wide", and computing "smt" from the CPU topology.

 - Add support for the new PERF_FORMAT_LOST perf_event_attr.read_format,
   that allows tooling to ask for the precise number of lost samples for
   a given event.

 - Add 'addr' sort key to see just the address of sampled instructions:

      $ perf record -o- true | perf report -i- -s addr
      [ perf record: Woken up 1 times to write data ]
      [ perf record: Captured and wrote 0.000 MB - ]
      # Samples: 12  of event 'cycles:u'
      # Event count (approx.): 252512
      #
      # Overhead  Address
      # ........  ..................
          42.96%  0x7f96f08443d7
          29.55%  0x7f96f0859b50
          14.76%  0x7f96f0852e02
           8.30%  0x7f96f0855028
           4.43%  0xffffffff8de01087

      perf annotate: Toggle full address <-> offset display

 - Add 'f' hotkey to the 'perf annotate' TUI interface when in
   'disassembler output' mode ('o' hotkey) to toggle showing full
   virtual address or just the offset.

 - Cache DSO build-ids when synthesizing PERF_RECORD_MMAP records for
   pre-existing threads, at the start of a 'perf record' session,
   speeding up that record startup phase.

 - Add a command line option to specify build ids in 'perf inject'.

 - Update JSON event files for the Intel alderlake, broadwell,
   broadwellde, broadwellx, cascadelakex, haswell, haswellx, icelake,
   icelakex, ivybridge, ivytown, jaketown, sandybridge, sapphirerapids,
   skylake, skylakex, and tigerlake processors.

 - Update vendor JSON event files for the ARM Neoverse V1 and E1
   platforms.

 - Add a 'perf test' entry for 'perf mem' where a struct has false
   sharing and this gets detected in the 'perf mem' output, tested with
   Intel, AMD and ARM64 systems.

 - Add a 'perf test' entry to test the resolution of java symbols, where
   an output like this is expected:

       8.18%  jshell    jitted-50116-29.so    [.] Interpreter
       0.75%  Thread-1  jitted-83602-1670.so  [.] jdk.internal.jimage.BasicImageReader.getString(int)

 - Add tests for the ARM64 CoreSight hardware tracing feature, with
   specially crafted pureloop, memcpy, thread loop and unroll thread
   programs that then get traced and the output compared with expected
   output.

   Documentation explaining it is also included.

 - Add per thread Intel PT 'perf test' entry to check that
   PERF_RECORD_TEXT_POKE events are recorded per CPU, resulting in a
   mixture of per thread and per CPU events and mmaps, verify that this
   gets all recorded correctly.

 - Introduce pthread mutex wrappers to allow for building with clang's
   -Wthread-safety, i.e. using the "guarded_by" "pt_guarded_by"
   "lockable", "exclusive_lock_function", "exclusive_trylock_function",
   "exclusive_locks_required", and "no_thread_safety_analysis" compiler
   function attributes.

 - Fix empty version number when building outside of a git repo.

 - Improve feature detection display when multiple versions of a feature
   are present, such as for binutils libbfd, that has a mix of possible
   ways to detect according to the Linux distribution.

   Previously in some cases we had:

      Auto-detecting system features
      <SNIP>
      ...                                  libbfd: [ on  ]
      ...                          libbfd-liberty: [ on  ]
      ...                        libbfd-liberty-z: [ on  ]
      <SNIP>

   Now for this case we show just the main feature:

      Auto-detecting system features
      <SNIP>
      ...                                  libbfd: [ on  ]
      <SNIP>

 - Remove some unused structs, variables, macros, function prototypes
   and includes from various places.

* tag 'perf-tools-for-v6.1-1-2022-10-07' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux: (169 commits)
  perf script: Add missing fields in usage hint
  perf mem: Print "LFB/MAB" for PERF_MEM_LVLNUM_LFB
  perf mem/c2c: Avoid printing empty lines for unsupported events
  perf mem/c2c: Add load store event mappings for AMD
  perf mem/c2c: Set PERF_SAMPLE_WEIGHT for LOAD_STORE events
  perf mem: Add support for printing PERF_MEM_LVLNUM_{CXL|IO}
  perf amd ibs: Sync arch/x86/include/asm/amd-ibs.h header with the kernel
  tools headers UAPI: Sync include/uapi/linux/perf_event.h header with the kernel
  perf stat: Fix cpu check to use id.cpu.cpu in aggr_printout()
  perf test coresight: Add relevant documentation about ARM64 CoreSight testing
  perf test: Add git ignore for tmp and output files of ARM CoreSight tests
  perf test coresight: Add unroll thread test shell script
  perf test coresight: Add unroll thread test tool
  perf test coresight: Add thread loop test shell scripts
  perf test coresight: Add thread loop test tool
  perf test coresight: Add memcpy thread test shell script
  perf test coresight: Add memcpy thread test tool
  perf test: Add git ignore for perf data generated by the ARM CoreSight tests
  perf test: Add arm64 asm pureloop test shell script
  perf test: Add asm pureloop test tool
  ...
parents 041bc24d d7931070
Loading
Loading
Loading
Loading
+158 −0
Original line number Diff line number Diff line
.. SPDX-License-Identifier: GPL-2.0

================
CoreSight - Perf
================

    :Author:   Carsten Haitzler <carsten.haitzler@arm.com>
    :Date:     June 29th, 2022

Perf is able to locally access CoreSight trace data and store it to the
output perf data files. This data can then be later decoded to give the
instructions that were traced for debugging or profiling purposes. You
can log such data with a perf record command like::

   perf record -e cs_etm//u testbinary

This would run some test binary (testbinary) until it exits and record
a perf.data trace file. That file would have AUX sections if CoreSight
is working correctly. You can dump the content of this file as
readable text with a command like::

   perf report --stdio --dump -i perf.data

You should find some sections of this file have AUX data blocks like::

   0x1e78 [0x30]: PERF_RECORD_AUXTRACE size: 0x11dd0  offset: 0  ref: 0x1b614fc1061b0ad1  idx: 0  tid: 531230  cpu: -1

   . ... CoreSight ETM Trace data: size 73168 bytes
           Idx:0; ID:10;   I_ASYNC : Alignment Synchronisation.
             Idx:12; ID:10;  I_TRACE_INFO : Trace Info.; INFO=0x0 { CC.0 }
             Idx:17; ID:10;  I_ADDR_L_64IS0 : Address, Long, 64 bit, IS0.; Addr=0x0000000000000000;
             Idx:26; ID:10;  I_TRACE_ON : Trace On.
             Idx:27; ID:10;  I_ADDR_CTXT_L_64IS0 : Address & Context, Long, 64 bit, IS0.; Addr=0x0000FFFFB6069140; Ctxt: AArch64,EL0, NS;
             Idx:38; ID:10;  I_ATOM_F6 : Atom format 6.; EEEEEEEEEEEEEEEEEEEEEEEE
             Idx:39; ID:10;  I_ATOM_F6 : Atom format 6.; EEEEEEEEEEEEEEEEEEEEEEEE
             Idx:40; ID:10;  I_ATOM_F6 : Atom format 6.; EEEEEEEEEEEEEEEEEEEEEEEE
             Idx:41; ID:10;  I_ATOM_F6 : Atom format 6.; EEEEEEEEEEEN
             ...

If you see these above, then your system is tracing CoreSight data
correctly.

To compile perf with CoreSight support in the tools/perf directory do::

    make CORESIGHT=1

This requires OpenCSD to build. You may install distribution packages
that provide it, such as libopencsd and libopencsd-dev, or download it
and build it yourself. Upstream OpenCSD is located at:

  https://github.com/Linaro/OpenCSD

For complete information on building perf with CoreSight support and
more extensive usage look at:

  https://github.com/Linaro/OpenCSD/blob/master/HOWTO.md


Kernel CoreSight Support
------------------------

You will also want CoreSight support enabled in your kernel config.
Ensure it is enabled with::

   CONFIG_CORESIGHT=y

There are various other CoreSight options you probably also want
enabled like::

   CONFIG_CORESIGHT_LINKS_AND_SINKS=y
   CONFIG_CORESIGHT_LINK_AND_SINK_TMC=y
   CONFIG_CORESIGHT_CATU=y
   CONFIG_CORESIGHT_SINK_TPIU=y
   CONFIG_CORESIGHT_SINK_ETBV10=y
   CONFIG_CORESIGHT_SOURCE_ETM4X=y
   CONFIG_CORESIGHT_CTI=y
   CONFIG_CORESIGHT_CTI_INTEGRATION_REGS=y

Please refer to the kernel configuration help for more information.

Perf test - Verify kernel and userspace perf CoreSight work
-----------------------------------------------------------

When you run perf test, it will do a lot of self tests. Some of those
tests will cover CoreSight (only if enabled and on ARM64). You
generally would run perf test from the tools/perf directory in the
kernel tree. Some tests will check some internal perf support like::

   Check Arm CoreSight trace data recording and synthesized samples
   Check Arm SPE trace data recording and synthesized samples

Some others will actually use perf record and some test binaries that
are in tests/shell/coresight and will collect traces to ensure a
minimum level of functionality is met. The scripts that launch these
tests are in the same directory. These will all look like::

   CoreSight / ASM Pure Loop
   CoreSight / Memcpy 16k 10 Threads
   CoreSight / Thread Loop 10 Threads - Check TID
   etc.

These perf record tests will not run if the tool binaries do not exist
in tests/shell/coresight/\*/ and will be skipped. If you do not have
CoreSight support in hardware then either do not build perf with
CoreSight support or remove these binaries in order to not have these
tests fail and have them skip instead.

These tests will log historical results in the current working
directory (e.g. tools/perf) and will be named stats-\*.csv like::

   stats-asm_pure_loop-out.csv
   stats-memcpy_thread-16k_10.csv
   ...

These statistic files log some aspects of the AUX data sections in
the perf data output counting some numbers of certain encodings (a
good way to know that it's working in a very simple way). One problem
with CoreSight is that given a large enough amount of data needing to
be logged, some of it can be lost due to the processor not waking up
in time to read out all the data from buffers, etc. You will notice
that the amount of data collected can vary a lot per run of perf test.
If you wish to see how this changes over time, simply run perf test
multiple times and all these csv files will have more and more data
appended to them that you can later examine, graph and otherwise use to
figure out if things have become worse or better.

This means sometimes these tests fail as they don't capture all the
data needed. This is about tracking quality and amount of data
produced over time and to see when changes to the Linux kernel improve
quality of traces.

Be aware that some of these tests take quite a while to run, specifically
in processing the perf data file and dumping contents to then examine what
is inside.

You can change where these csv logs are stored by setting the
PERF_TEST_CORESIGHT_STATDIR environment variable before running perf
test like::

   export PERF_TEST_CORESIGHT_STATDIR=/var/tmp
   perf test

They will also store resulting perf output data in the current
directory for later inspection like::

   perf-asm_pure_loop-out.data
   perf-memcpy_thread-16k_10.data
   ...

You can alter where the perf data files are stored by setting the
PERF_TEST_CORESIGHT_DATADIR environment variable such as::

   export PERF_TEST_CORESIGHT_DATADIR=/var/tmp
   perf test

You may wish to set these above environment variables if you wish to
keep the output of tests outside of the current working directory for
longer term storage and examination.
+1 −0
Original line number Diff line number Diff line
@@ -2067,6 +2067,7 @@ F: drivers/hwtracing/coresight/*
F:	include/dt-bindings/arm/coresight-cti-dt.h
F:	include/linux/coresight*
F:	samples/coresight/*
F:	tools/perf/tests/shell/coresight/*
F:	tools/perf/arch/arm/util/auxtrace.c
F:	tools/perf/arch/arm/util/cs-etm.c
F:	tools/perf/arch/arm/util/cs-etm.h
+16 −0
Original line number Diff line number Diff line
@@ -6,6 +6,22 @@

#include "msr-index.h"

/* IBS_OP_DATA2 DataSrc */
#define IBS_DATA_SRC_LOC_CACHE			 2
#define IBS_DATA_SRC_DRAM			 3
#define IBS_DATA_SRC_REM_CACHE			 4
#define IBS_DATA_SRC_IO				 7

/* IBS_OP_DATA2 DataSrc Extension */
#define IBS_DATA_SRC_EXT_LOC_CACHE		 1
#define IBS_DATA_SRC_EXT_NEAR_CCX_CACHE		 2
#define IBS_DATA_SRC_EXT_DRAM			 3
#define IBS_DATA_SRC_EXT_FAR_CCX_CACHE		 5
#define IBS_DATA_SRC_EXT_PMEM			 6
#define IBS_DATA_SRC_EXT_IO			 7
#define IBS_DATA_SRC_EXT_EXT_MEM		 8
#define IBS_DATA_SRC_EXT_PEER_AGENT_MEM		12

/*
 * IBS Hardware MSRs
 */
+35 −15
Original line number Diff line number Diff line
@@ -137,6 +137,12 @@ FEATURE_DISPLAY ?= \
         libaio			\
         libzstd

#
# Declare group members of a feature to display the logical OR of the detection
# result instead of each member result.
#
FEATURE_GROUP_MEMBERS-libbfd = libbfd-liberty libbfd-liberty-z

# Set FEATURE_CHECK_(C|LD)FLAGS-all for all FEATURE_TESTS features.
# If in the future we need per-feature checks/flags for features not
# mentioned in this list we need to refactor this ;-).
@@ -177,19 +183,28 @@ endif
#
# Print the result of the feature test:
#
feature_print_status = $(eval $(feature_print_status_code)) $(info $(MSG))
feature_print_status = $(eval $(feature_print_status_code))

feature_group = $(eval $(feature_gen_group)) $(GROUP)

define feature_gen_group
  GROUP := $(1)
  ifneq ($(feature_verbose),1)
    GROUP += $(FEATURE_GROUP_MEMBERS-$(1))
  endif
endef

define feature_print_status_code
  ifeq ($(feature-$(1)), 1)
    MSG = $(shell printf '...%30s: [ \033[32mon\033[m  ]' $(1))
  ifneq (,$(filter 1,$(foreach feat,$(call feature_group,$(feat)),$(feature-$(feat)))))
    MSG = $(shell printf '...%40s: [ \033[32mon\033[m  ]' $(1))
  else
    MSG = $(shell printf '...%30s: [ \033[31mOFF\033[m ]' $(1))
    MSG = $(shell printf '...%40s: [ \033[31mOFF\033[m ]' $(1))
  endif
endef

feature_print_text = $(eval $(feature_print_text_code)) $(info $(MSG))
feature_print_text = $(eval $(feature_print_text_code))
define feature_print_text_code
    MSG = $(shell printf '...%30s: %s' $(1) $(2))
    MSG = $(shell printf '...%40s: %s' $(1) $(2))
endef

#
@@ -244,24 +259,29 @@ ifeq ($(VF),1)
  feature_verbose := 1
endif

ifneq ($(feature_verbose),1)
  #
  # Determine the features to omit from the displayed message, as only the
  # logical OR of the detection result will be shown.
  #
  FEATURE_OMIT := $(foreach feat,$(FEATURE_DISPLAY),$(FEATURE_GROUP_MEMBERS-$(feat)))
endif

feature_display_entries = $(eval $(feature_display_entries_code))
define feature_display_entries_code
  ifeq ($(feature_display),1)
    $(info )
    $(info Auto-detecting system features:)
    $(foreach feat,$(FEATURE_DISPLAY),$(call feature_print_status,$(feat),))
    ifneq ($(feature_verbose),1)
      $(info )
    endif
    $$(info )
    $$(info Auto-detecting system features:)
    $(foreach feat,$(filter-out $(FEATURE_OMIT),$(FEATURE_DISPLAY)),$(call feature_print_status,$(feat),) $$(info $(MSG)))
  endif

  ifeq ($(feature_verbose),1)
    TMP := $(filter-out $(FEATURE_DISPLAY),$(FEATURE_TESTS))
    $(foreach feat,$(TMP),$(call feature_print_status,$(feat),))
    $(info )
    $(eval TMP := $(filter-out $(FEATURE_DISPLAY),$(FEATURE_TESTS)))
    $(foreach feat,$(TMP),$(call feature_print_status,$(feat),) $$(info $(MSG)))
  endif
endef

ifeq ($(FEATURE_DISPLAY_DEFERRED),)
  $(call feature_display_entries)
  $(info )
endif
+38 −2
Original line number Diff line number Diff line
@@ -204,6 +204,8 @@ enum perf_branch_sample_type_shift {

	PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT	= 17, /* save low level index of raw branch records */

	PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT	= 18, /* save privilege mode */

	PERF_SAMPLE_BRANCH_MAX_SHIFT		/* non-ABI */
};

@@ -233,6 +235,8 @@ enum perf_branch_sample_type {

	PERF_SAMPLE_BRANCH_HW_INDEX	= 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT,

	PERF_SAMPLE_BRANCH_PRIV_SAVE	= 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT,

	PERF_SAMPLE_BRANCH_MAX		= 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
};

@@ -253,9 +257,37 @@ enum {
	PERF_BR_COND_RET	= 10,	/* conditional function return */
	PERF_BR_ERET		= 11,	/* exception return */
	PERF_BR_IRQ		= 12,	/* irq */
	PERF_BR_SERROR		= 13,	/* system error */
	PERF_BR_NO_TX		= 14,	/* not in transaction */
	PERF_BR_EXTEND_ABI	= 15,	/* extend ABI */
	PERF_BR_MAX,
};

/*
 * Additional ("new") branch types: alignment, data and instruction faults,
 * followed by five architecture-specific slots (PERF_BR_NEW_ARCH_1..5).
 *
 * NOTE(review): presumably these are the values carried in
 * perf_branch_entry.new_type ("additional branch type") when the base type
 * is PERF_BR_EXTEND_ABI -- confirm against the kernel side.
 */
enum {
	PERF_BR_NEW_FAULT_ALGN		= 0,    /* Alignment fault */
	PERF_BR_NEW_FAULT_DATA		= 1,    /* Data fault */
	PERF_BR_NEW_FAULT_INST		= 2,    /* Inst fault */
	PERF_BR_NEW_ARCH_1		= 3,    /* Architecture specific */
	PERF_BR_NEW_ARCH_2		= 4,    /* Architecture specific */
	PERF_BR_NEW_ARCH_3		= 5,    /* Architecture specific */
	PERF_BR_NEW_ARCH_4		= 6,    /* Architecture specific */
	PERF_BR_NEW_ARCH_5		= 7,    /* Architecture specific */
	/* One past the last value defined above; must stay the final entry. */
	PERF_BR_NEW_MAX,
};

/*
 * Privilege level values for a branch record: unknown, user, kernel and HV.
 *
 * NOTE(review): presumably the encoding of perf_branch_entry.priv
 * ("privilege level"), and HV presumably means hypervisor -- confirm.
 */
enum {
	PERF_BR_PRIV_UNKNOWN	= 0,
	PERF_BR_PRIV_USER	= 1,
	PERF_BR_PRIV_KERNEL	= 2,
	PERF_BR_PRIV_HV		= 3,
};

/*
 * arm64 names for the architecture-specific branch type slots: each
 * PERF_BR_ARM64_* macro is an alias for one PERF_BR_NEW_ARCH_<n> value
 * (FIQ, debug halt, debug exit, debug instruction, debug data).
 */
#define PERF_BR_ARM64_FIQ              PERF_BR_NEW_ARCH_1
#define PERF_BR_ARM64_DEBUG_HALT       PERF_BR_NEW_ARCH_2
#define PERF_BR_ARM64_DEBUG_EXIT       PERF_BR_NEW_ARCH_3
#define PERF_BR_ARM64_DEBUG_INST       PERF_BR_NEW_ARCH_4
#define PERF_BR_ARM64_DEBUG_DATA       PERF_BR_NEW_ARCH_5

#define PERF_SAMPLE_BRANCH_PLM_ALL \
	(PERF_SAMPLE_BRANCH_USER|\
	 PERF_SAMPLE_BRANCH_KERNEL|\
@@ -1295,7 +1327,9 @@ union perf_mem_data_src {
#define PERF_MEM_LVLNUM_L2	0x02 /* L2 */
#define PERF_MEM_LVLNUM_L3	0x03 /* L3 */
#define PERF_MEM_LVLNUM_L4	0x04 /* L4 */
/* 5-0xa available */
/* 5-0x8 available */
#define PERF_MEM_LVLNUM_CXL	0x09 /* CXL */
#define PERF_MEM_LVLNUM_IO	0x0a /* I/O */
#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */
#define PERF_MEM_LVLNUM_LFB	0x0c /* LFB */
#define PERF_MEM_LVLNUM_RAM	0x0d /* RAM */
@@ -1373,7 +1407,9 @@ struct perf_branch_entry {
		abort:1,    /* transaction abort */
		cycles:16,  /* cycle count to last branch */
		type:4,     /* branch type */
		reserved:40;
		new_type:4, /* additional branch type */
		priv:3,     /* privilege level */
		reserved:33;
};

union perf_sample_weight {
Loading