From 8d48794bb3bf7d7e421204a8cc3bd5c95ffc609b Mon Sep 17 00:00:00 2001
From: Mihai Mihalache
Date: Wed, 16 Mar 2016 08:21:12 -0700
Subject: [PATCH 001/705] regulator: gpio: check return value of of_get_named_gpio

At boot time the regulator driver can be initialized before the gpio,
in which case the call to of_get_named_gpio will return -EPROBE_DEFER.
This value is silently passed to regulator_register(), which will
return success although the gpio is not registered
(regulator_ena_gpio_request() is not called), as the value passed is
detected as invalid. gpio_regulator_probe() will therefore succeed
with no gpio requested.

Signed-off-by: Mihai Mihalache
Reviewed-by: Hans Holmberg
Signed-off-by: Mark Brown
---
 drivers/regulator/gpio-regulator.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/regulator/gpio-regulator.c b/drivers/regulator/gpio-regulator.c
index a8718e98674a2..83e89e5d47526 100644
--- a/drivers/regulator/gpio-regulator.c
+++ b/drivers/regulator/gpio-regulator.c
@@ -162,6 +162,8 @@ of_get_gpio_regulator_config(struct device *dev, struct device_node *np,
 	of_property_read_u32(np, "startup-delay-us", &config->startup_delay);
 
 	config->enable_gpio = of_get_named_gpio(np, "enable-gpio", 0);
+	if (config->enable_gpio == -EPROBE_DEFER)
+		return ERR_PTR(-EPROBE_DEFER);
 
 	/* Fetch GPIOs. - optional property*/
 	ret = of_gpio_count(np);
-- 
GitLab


From 2596e07a3ed5a5f4d8b89be316c2b704d6f5dc5f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Thu, 17 Mar 2016 18:23:40 +0100
Subject: [PATCH 002/705] regmap: fix documentation to match code

The regmap binding talks about one thing, which is register endianness,
and it gets almost every aspect of it wrong. This replaces the current
text of the file with a version that makes more sense and that matches
what we implement now.

Signed-off-by: Arnd Bergmann
Fixes: a06c488da0b0 ("regmap: Add explict native endian flag to DT bindings")
Fixes: 275876e208e2 ("regmap: Add the DT binding documentation for endianness")
Signed-off-by: Mark Brown
---
 .../devicetree/bindings/regmap/regmap.txt | 59 ++++++-------------
 1 file changed, 19 insertions(+), 40 deletions(-)

diff --git a/Documentation/devicetree/bindings/regmap/regmap.txt b/Documentation/devicetree/bindings/regmap/regmap.txt
index e98a9652ccc8c..0127be360fe85 100644
--- a/Documentation/devicetree/bindings/regmap/regmap.txt
+++ b/Documentation/devicetree/bindings/regmap/regmap.txt
@@ -1,50 +1,29 @@
-Device-Tree binding for regmap
-
-The endianness mode of CPU & Device scenarios:
-Index     Device     Endianness properties
----------------------------------------------------
-1         BE         'big-endian'
-2         LE         'little-endian'
-3         Native     'native-endian'
-
-For one device driver, which will run in different scenarios above
-on different SoCs using the devicetree, we need one way to simplify
-this.
+Devicetree binding for regmap
 
 Optional properties:
-- {big,little,native}-endian: these are boolean properties, if absent
-  then the implementation will choose a default based on the device
-  being controlled. These properties are for register values and all
-  the buffers only. Native endian means that the CPU and device have
-  the same endianness.
 
-Examples:
-Scenario 1 : CPU in LE mode & device in LE mode.
-dev: dev@40031000 {
-	compatible = "name";
-	reg = <0x40031000 0x1000>;
-	...
-};
+   little-endian,
+   big-endian,
+   native-endian: See common-properties.txt for a definition
 
-Scenario 2 : CPU in LE mode & device in BE mode.
-dev: dev@40031000 {
-	compatible = "name";
-	reg = <0x40031000 0x1000>;
-	...
- big-endian; -}; +Note: +Regmap defaults to little-endian register access on MMIO based +devices, this is by far the most common setting. On CPU +architectures that typically run big-endian operating systems +(e.g. PowerPC), registers can be defined as big-endian and must +be marked that way in the devicetree. -Scenario 3 : CPU in BE mode & device in BE mode. -dev: dev@40031000 { - compatible = "name"; - reg = <0x40031000 0x1000>; - ... -}; +On SoCs that can be operated in both big-endian and little-endian +modes, with a single hardware switch controlling both the endianess +of the CPU and a byteswap for MMIO registers (e.g. many Broadcom MIPS +chips), "native-endian" is used to allow using the same device tree +blob in both cases. -Scenario 4 : CPU in BE mode & device in LE mode. +Examples: +Scenario 1 : a register set in big-endian mode. dev: dev@40031000 { - compatible = "name"; + compatible = "syscon"; reg = <0x40031000 0x1000>; + big-endian; ... - little-endian; }; -- GitLab From 9f9f8b863ad130ec0c25f378bdbad64ba71291de Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 23 Mar 2016 12:13:12 +0000 Subject: [PATCH 003/705] regmap: mmio: Fix value endianness selection Currently when selecting value endianness we check the register endiannes, not the value endianness. Reported-by: Alexander Stein Tested-by: Alexander Stein Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-mmio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/regmap/regmap-mmio.c b/drivers/base/regmap/regmap-mmio.c index 7526906ca080f..b27573c69af74 100644 --- a/drivers/base/regmap/regmap-mmio.c +++ b/drivers/base/regmap/regmap-mmio.c @@ -245,7 +245,7 @@ static struct regmap_mmio_context *regmap_mmio_gen_context(struct device *dev, ctx->val_bytes = config->val_bits / 8; ctx->clk = ERR_PTR(-ENODEV); - switch (config->reg_format_endian) { + switch (config->val_format_endian) { case REGMAP_ENDIAN_DEFAULT: case REGMAP_ENDIAN_LITTLE: #ifdef __LITTLE_ENDIAN -- GitLab From 3b672623079bb3e5685b8549e514f2dfaa564406 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 28 Mar 2016 13:09:56 +0900 Subject: [PATCH 004/705] regulator: s2mps11: Fix invalid selector mask and voltages for buck9 The buck9 regulator of S2MPS11 PMIC had incorrect vsel_mask (0xff instead of 0x1f) thus reading entire register as buck9's voltage. This effectively caused regulator core to interpret values as higher voltages than they were and then to set real voltage much lower than intended. The buck9 provides power to other regulators, including LDO13 and LDO19 which supply the MMC2 (SD card). On Odroid XU3/XU4 the lower voltage caused SD card detection errors on Odroid XU3/XU4: mmc1: card never left busy state mmc1: error -110 whilst initialising SD card During driver probe the regulator core was checking whether initial voltage matches the constraints. With incorrect vsel_mask of 0xff and default value of 0x50, the core interpreted this as 5 V which is outside of constraints (3-3.775 V). Then the regulator core was adjusting the voltage to match the constraints. With incorrect vsel_mask this new voltage mapped to a vere low voltage in the driver. 
Signed-off-by: Krzysztof Kozlowski
Reviewed-by: Javier Martinez Canillas
Tested-by: Javier Martinez Canillas
Signed-off-by: Mark Brown
Cc:
---
 drivers/regulator/s2mps11.c         | 28 ++++++++++++++++++++++------
 include/linux/mfd/samsung/s2mps11.h |  2 ++
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/drivers/regulator/s2mps11.c b/drivers/regulator/s2mps11.c
index d24e2c783dc5c..6dfa3502e1f1a 100644
--- a/drivers/regulator/s2mps11.c
+++ b/drivers/regulator/s2mps11.c
@@ -308,7 +308,7 @@ static struct regulator_ops s2mps11_buck_ops = {
 	.enable_mask	= S2MPS11_ENABLE_MASK			\
 }
 
-#define regulator_desc_s2mps11_buck6_10(num, min, step) {	\
+#define regulator_desc_s2mps11_buck67810(num, min, step) {	\
 	.name		= "BUCK"#num,				\
 	.id		= S2MPS11_BUCK##num,			\
 	.ops		= &s2mps11_buck_ops,			\
@@ -324,6 +324,22 @@ static struct regulator_ops s2mps11_buck_ops = {
 	.enable_mask	= S2MPS11_ENABLE_MASK			\
 }
 
+#define regulator_desc_s2mps11_buck9 {				\
+	.name		= "BUCK9",				\
+	.id		= S2MPS11_BUCK9,			\
+	.ops		= &s2mps11_buck_ops,			\
+	.type		= REGULATOR_VOLTAGE,			\
+	.owner		= THIS_MODULE,				\
+	.min_uV		= MIN_3000_MV,				\
+	.uV_step	= STEP_25_MV,				\
+	.n_voltages	= S2MPS11_BUCK9_N_VOLTAGES,		\
+	.ramp_delay	= S2MPS11_RAMP_DELAY,			\
+	.vsel_reg	= S2MPS11_REG_B9CTRL2,			\
+	.vsel_mask	= S2MPS11_BUCK9_VSEL_MASK,		\
+	.enable_reg	= S2MPS11_REG_B9CTRL1,			\
+	.enable_mask	= S2MPS11_ENABLE_MASK			\
+}
+
 static const struct regulator_desc s2mps11_regulators[] = {
 	regulator_desc_s2mps11_ldo(1, STEP_25_MV),
 	regulator_desc_s2mps11_ldo(2, STEP_50_MV),
@@ -368,11 +384,11 @@ static const struct regulator_desc s2mps11_regulators[] = {
 	regulator_desc_s2mps11_buck1_4(3),
 	regulator_desc_s2mps11_buck1_4(4),
 	regulator_desc_s2mps11_buck5,
-	regulator_desc_s2mps11_buck6_10(6, MIN_600_MV, STEP_6_25_MV),
-	regulator_desc_s2mps11_buck6_10(7, MIN_600_MV, STEP_6_25_MV),
-	regulator_desc_s2mps11_buck6_10(8, MIN_600_MV, STEP_6_25_MV),
-	regulator_desc_s2mps11_buck6_10(9, MIN_3000_MV, STEP_25_MV),
-	regulator_desc_s2mps11_buck6_10(10, MIN_750_MV, STEP_12_5_MV),
+	regulator_desc_s2mps11_buck67810(6, MIN_600_MV, STEP_6_25_MV),
+	regulator_desc_s2mps11_buck67810(7, MIN_600_MV, STEP_6_25_MV),
+	regulator_desc_s2mps11_buck67810(8, MIN_600_MV, STEP_6_25_MV),
+	regulator_desc_s2mps11_buck9,
+	regulator_desc_s2mps11_buck67810(10, MIN_750_MV, STEP_12_5_MV),
 };
 
 static struct regulator_ops s2mps14_reg_ops;
diff --git a/include/linux/mfd/samsung/s2mps11.h b/include/linux/mfd/samsung/s2mps11.h
index b288965e8101d..2c14eeca46f03 100644
--- a/include/linux/mfd/samsung/s2mps11.h
+++ b/include/linux/mfd/samsung/s2mps11.h
@@ -173,10 +173,12 @@ enum s2mps11_regulators {
 
 #define S2MPS11_LDO_VSEL_MASK	0x3F
 #define S2MPS11_BUCK_VSEL_MASK	0xFF
+#define S2MPS11_BUCK9_VSEL_MASK	0x1F
 #define S2MPS11_ENABLE_MASK	(0x03 << S2MPS11_ENABLE_SHIFT)
 #define S2MPS11_ENABLE_SHIFT	0x06
 #define S2MPS11_LDO_N_VOLTAGES	(S2MPS11_LDO_VSEL_MASK + 1)
 #define S2MPS11_BUCK_N_VOLTAGES	(S2MPS11_BUCK_VSEL_MASK + 1)
+#define S2MPS11_BUCK9_N_VOLTAGES (S2MPS11_BUCK9_VSEL_MASK + 1)
 
 #define S2MPS11_RAMP_DELAY	25000		/* uV/us */
 
 #define S2MPS11_CTRL1_PWRHOLD_MASK	BIT(4)
-- 
GitLab


From 02f037d641dc6672be5cfe7875a48ab99b95b154 Mon Sep 17 00:00:00 2001
From: Toshi Kani
Date: Wed, 23 Mar 2016 15:41:57 -0600
Subject: [PATCH 005/705] x86/mm/pat: Add support of non-default PAT MSR setting

In preparation for fixing a regression caused by:

  9cd25aac1f44 ("x86/mm/pat: Emulate PAT when it is disabled")

... PAT needs to support a case where the PAT MSR is initialized with
a non-default value.
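As background (an editorial sketch, not text from the patch): each of
the eight bytes of the IA32_PAT MSR holds one table entry, and only
the low three bits of a byte select a memory type, so any PAT MSR
value can be decoded with a few shifts. The 0x0007040600070406
constant below is the power-on default defined by the Intel SDM, not a
value taken from this series; it decodes to the same WB, WT, UC-, UC
pattern that the emulated table in this patch reproduces:

    #include <stdio.h>
    #include <stdint.h>

    /* Memory-type encodings used by the low 3 bits of each PAT entry. */
    static const char *pat_type(uint64_t entry)
    {
            switch (entry & 0x7) {
            case 0: return "UC";
            case 1: return "WC";
            case 4: return "WT";
            case 5: return "WP";
            case 6: return "WB";
            case 7: return "UC-";
            default: return "reserved";
            }
    }

    int main(void)
    {
            uint64_t pat = 0x0007040600070406ULL;   /* SDM power-on default */
            int i;

            for (i = 0; i < 8; i++)                 /* one entry per byte */
                    printf("PAT%d = %s\n", i, pat_type(pat >> (i * 8)));
            return 0;
    }

A guest whose PAT MSR includes WC (as the commit message notes Xen
arranges) is exactly the non-default case that the code below teaches
init_cache_modes() to accept.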
When pat_init() is called and PAT is disabled, it initializes the PAT
table with the BIOS default value. Xen, however, sets the PAT MSR with
a non-default value to enable WC. This causes an inconsistency between
the PAT table and the PAT MSR when PAT is disabled on Xen.

Change pat_init() to handle the PAT disable cases properly. Add
init_cache_modes() to handle the two cases when PAT is disabled:

 1. CPU supports PAT: Set the PAT table to be consistent with the
    PAT MSR.
 2. CPU does not support PAT: Set the PAT table to be consistent with
    the PWT and PCD bits in a PTE.

Note, __init_cache_modes(), renamed from pat_init_cache_modes(), will
be changed to a static function in a later patch.

Signed-off-by: Toshi Kani
Reviewed-by: Thomas Gleixner
Cc: Andrew Morton
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Juergen Gross
Cc: Linus Torvalds
Cc: Luis R. Rodriguez
Cc: Peter Zijlstra
Cc: Toshi Kani
Cc: elliott@hpe.com
Cc: konrad.wilk@oracle.com
Cc: paul.gortmaker@windriver.com
Cc: xen-devel@lists.xenproject.org
Link: http://lkml.kernel.org/r/1458769323-24491-2-git-send-email-toshi.kani@hpe.com
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/pat.h |  2 +-
 arch/x86/mm/pat.c          | 73 +++++++++++++++++++++++++++-----------
 arch/x86/xen/enlighten.c   |  2 +-
 3 files changed, 55 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index ca6c228d5e628..97ea55bc2b54e 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -6,7 +6,7 @@
 bool pat_enabled(void);
 extern void pat_init(void);
-void pat_init_cache_modes(u64);
+void __init_cache_modes(u64);
 
 extern int reserve_memtype(u64 start, u64 end,
 		enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index faec01e7a17d2..b4663885308f2 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -181,7 +181,7 @@ static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
  * configuration.
  * Using lower indices is preferred, so we start with highest index.
  */
-void pat_init_cache_modes(u64 pat)
+void __init_cache_modes(u64 pat)
 {
 	enum page_cache_mode cache;
 	char pat_msg[33];
@@ -207,9 +207,6 @@ static void pat_bsp_init(u64 pat)
 		return;
 	}
 
-	if (!pat_enabled())
-		goto done;
-
 	rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
 	if (!tmp_pat) {
 		pat_disable("PAT MSR is 0, disabled.");
@@ -218,15 +215,11 @@
 	wrmsrl(MSR_IA32_CR_PAT, pat);
 
-done:
-	pat_init_cache_modes(pat);
+	__init_cache_modes(pat);
 }
 
 static void pat_ap_init(u64 pat)
 {
-	if (!pat_enabled())
-		return;
-
 	if (!cpu_has_pat) {
 		/*
 		 * If this happens we are on a secondary CPU, but switched to
@@ -238,18 +231,32 @@ static void pat_ap_init(u64 pat)
 	wrmsrl(MSR_IA32_CR_PAT, pat);
 }
 
-void pat_init(void)
+static void init_cache_modes(void)
 {
-	u64 pat;
-	struct cpuinfo_x86 *c = &boot_cpu_data;
+	u64 pat = 0;
+	static int init_cm_done;
 
-	if (!pat_enabled()) {
+	if (init_cm_done)
+		return;
+
+	if (boot_cpu_has(X86_FEATURE_PAT)) {
+		/*
+		 * CPU supports PAT. Set PAT table to be consistent with
+		 * PAT MSR. This case supports the "nopat" boot option, and
+		 * virtual machine environments which support PAT without
+		 * MTRRs. Specifically, Xen has a unique setup for the PAT MSR.
+		 *
+		 * If PAT MSR returns 0, it is considered invalid and emulates
+		 * as No PAT.
+		 */
+		rdmsrl(MSR_IA32_CR_PAT, pat);
+	}
+
+	if (!pat) {
 		/*
 		 * No PAT. Emulate the PAT table that corresponds to the two
-		 * cache bits, PWT (Write Through) and PCD (Cache Disable). This
-		 * setup is the same as the BIOS default setup when the system
-		 * has PAT but the "nopat" boot option has been specified. This
-		 * emulated PAT table is used when MSR_IA32_CR_PAT returns 0.
+		 * cache bits, PWT (Write Through) and PCD (Cache Disable).
+		 * This setup is also the same as the BIOS default setup.
 		 *
 		 * PTE encoding:
 		 *
@@ -266,10 +273,36 @@ void pat_init(void)
 		 */
 		pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
 		      PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);
+	}
+
+	__init_cache_modes(pat);
+
+	init_cm_done = 1;
+}
+
+/**
+ * pat_init - Initialize PAT MSR and PAT table
+ *
+ * This function initializes PAT MSR and PAT table with an OS-defined value
+ * to enable additional cache attributes, WC and WT.
+ *
+ * This function must be called on all CPUs using the specific sequence of
+ * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this
+ * procedure for PAT.
+ */
+void pat_init(void)
+{
+	u64 pat;
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+
+	if (!pat_enabled()) {
+		init_cache_modes();
+		return;
+	}
 
-	} else if ((c->x86_vendor == X86_VENDOR_INTEL) &&
-		   (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
-		    ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
+	if ((c->x86_vendor == X86_VENDOR_INTEL) &&
+	    (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
+	     ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
 		/*
 		 * PAT support with the lower four entries. Intel Pentium 2,
 		 * 3, M, and 4 are affected by PAT errata, which makes the
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 880862c7d9ddb..c469a7c7c3094 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1623,7 +1623,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
 	 * configuration.
 	 */
 	rdmsrl(MSR_IA32_CR_PAT, pat);
-	pat_init_cache_modes(pat);
+	__init_cache_modes(pat);
 
 	/* keep using Xen gdt for now; no urgent need to change it */
 
-- 
GitLab


From 224bb1e5d67ba0f2872c98002d6a6f991ac6fd4a Mon Sep 17 00:00:00 2001
From: Toshi Kani
Date: Wed, 23 Mar 2016 15:41:58 -0600
Subject: [PATCH 006/705] x86/mm/pat: Add pat_disable() interface

In preparation for fixing a regression caused by:

  9cd25aac1f44 ("x86/mm/pat: Emulate PAT when it is disabled")

... PAT needs to provide an interface that prevents the OS from
initializing the PAT MSR.

PAT MSR initialization must be done on all CPUs using the specific
sequence of operations defined in the Intel SDM. This requires MTRRs
to be enabled since pat_init() is called as part of MTRR init from
mtrr_rendezvous_handler().

Make pat_disable() the interface that prevents the OS from
initializing the PAT MSR. MTRR will call this interface when it cannot
provide the SDM-defined sequence to initialize PAT.

This also assures that pat_disable() called from pat_bsp_init() will
set the PAT table properly when the CPU does not support PAT.

Signed-off-by: Toshi Kani
Reviewed-by: Thomas Gleixner
Cc: Andrew Morton
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Juergen Gross
Cc: Linus Torvalds
Cc: Luis R.
Rodriguez Cc: Peter Zijlstra Cc: Robert Elliott Cc: Toshi Kani Cc: konrad.wilk@oracle.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-3-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pat.h | 1 + arch/x86/mm/pat.c | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 97ea55bc2b54e..0ad356c066eff 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -5,6 +5,7 @@ #include bool pat_enabled(void); +void pat_disable(const char *reason); extern void pat_init(void); void __init_cache_modes(u64); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index b4663885308f2..1cc1d37f1de7d 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -40,11 +40,22 @@ static bool boot_cpu_done; static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT); +static void init_cache_modes(void); -static inline void pat_disable(const char *reason) +void pat_disable(const char *reason) { + if (!__pat_enabled) + return; + + if (boot_cpu_done) { + WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n"); + return; + } + __pat_enabled = 0; pr_info("x86/PAT: %s\n", reason); + + init_cache_modes(); } static int __init nopat(char *str) -- GitLab From d63dcf49cf5ae5605f4d14229e3888e104f294b1 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:41:59 -0600 Subject: [PATCH 007/705] x86/mm/pat: Replace cpu_has_pat with boot_cpu_has() Borislav Petkov suggested: > Please use on init paths boot_cpu_has(X86_FEATURE_PAT) and on fast > paths static_cpu_has(X86_FEATURE_PAT). No more of that cpu_has_XXX > ugliness. Replace the use of cpu_has_pat on init paths with boot_cpu_has(). Suggested-by: Borislav Petkov Signed-off-by: Toshi Kani Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Robert Elliott Cc: Toshi Kani Cc: konrad.wilk@oracle.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-4-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 1cc1d37f1de7d..59ec038b98339 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -213,7 +213,7 @@ static void pat_bsp_init(u64 pat) { u64 tmp_pat; - if (!cpu_has_pat) { + if (!boot_cpu_has(X86_FEATURE_PAT)) { pat_disable("PAT not supported by CPU."); return; } @@ -231,7 +231,7 @@ static void pat_bsp_init(u64 pat) static void pat_ap_init(u64 pat) { - if (!cpu_has_pat) { + if (!boot_cpu_has(X86_FEATURE_PAT)) { /* * If this happens we are on a secondary CPU, but switched to * PAT on the boot CPU. We have no way to undo PAT. -- GitLab From edfe63ec97ed8d4496225f7ba54c9ce4207c5431 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:42:00 -0600 Subject: [PATCH 008/705] x86/mtrr: Fix Xorg crashes in Qemu sessions A Xorg failure on qemu32 was reported as a regression [1] caused by commit 9cd25aac1f44 ("x86/mm/pat: Emulate PAT when it is disabled"). This patch fixes the Xorg crash. 
Negative effects of this regression were the following two failures
[2] in Xorg on QEMU with QEMU CPU model "qemu32" (-cpu qemu32), which
were triggered by the fact that its virtual CPU does not support
MTRRs.

#1. copy_process() failed in the check in reserve_pfn_range()

    copy_process
     copy_mm
      dup_mm
       dup_mmap
        copy_page_range
         track_pfn_copy
          reserve_pfn_range

A WC map request was tracked as WC in memtype, which set a PTE as UC
(pgprot) per __cachemode2pte_tbl[]. This led to this error in
reserve_pfn_range() called from track_pfn_copy(), which obtained a
pgprot from a PTE. It converts pgprot to page_cache_mode, which does
not necessarily result in the original page_cache_mode since
__cachemode2pte_tbl[] redirects multiple types to UC.

#2. error path in copy_process() then hit WARN_ON_ONCE in
untrack_pfn().

    x86/PAT: Xorg:509 map pfn expected mapping type uncached-minus
    for [mem 0xfd000000-0xfdffffff], got write-combining
    Call Trace:
     dump_stack
     warn_slowpath_common
     ? untrack_pfn
     ? untrack_pfn
     warn_slowpath_null
     untrack_pfn
     ? __kunmap_atomic
     unmap_single_vma
     ? pagevec_move_tail_fn
     unmap_vmas
     exit_mmap
     mmput
     copy_process.part.47
     _do_fork
     SyS_clone
     do_syscall_32_irqs_on
     entry_INT80_32

These negative effects are caused by two separate bugs, but they can
be addressed in separate patches. Fixing the pat_init() issue
described below addresses the root cause, and keeps Xorg from hitting
these cases.

When the CPU does not support MTRRs, MTRR does not call pat_init(),
which leaves PAT enabled without initializing PAT. This pat_init()
issue is a long-standing issue, but manifested as issue #1 (and then
hit issue #2) with the above-mentioned commit because the memtype now
tracks cache attributes with 'page_cache_mode'.

This pat_init() issue existed before the commit, but we used pgprot in
memtype. Hence, we did not have issue #1 before. But a WC request
resulted in WT in effect, because a WC pgprot is actually WT when PAT
is not initialized. This is not how it was designed to work. When PAT
is properly set to disabled, WC is converted to UC. The use of WT can
result in a system crash if the target range does not support WT.
Fortunately, nobody ran into such an issue before.

To fix this pat_init() issue, the PAT code has been enhanced to
provide a pat_disable() interface. Call this interface when MTRRs are
disabled. By setting PAT to disabled properly, PAT bypasses the
memtype check, and avoids issue #1.

  [1]: https://lkml.org/lkml/2016/3/3/828
  [2]: https://lkml.org/lkml/2016/3/4/775

Signed-off-by: Toshi Kani
Reviewed-by: Thomas Gleixner
Cc: Andrew Morton
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Juergen Gross
Cc: Linus Torvalds
Cc: Luis R.
Rodriguez Cc: Peter Zijlstra Cc: Toshi Kani Cc: elliott@hpe.com Cc: konrad.wilk@oracle.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-5-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mtrr.h | 6 +++++- arch/x86/kernel/cpu/mtrr/main.c | 10 +++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index b94f6f64e23d0..dbff1456d2152 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -24,6 +24,7 @@ #define _ASM_X86_MTRR_H #include +#include /* @@ -83,9 +84,12 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } +static inline void mtrr_bp_init(void) +{ + pat_disable("MTRRs disabled, skipping PAT initialization too."); +} #define mtrr_ap_init() do {} while (0) -#define mtrr_bp_init() do {} while (0) #define set_mtrr_aps_delayed_init() do {} while (0) #define mtrr_aps_init() do {} while (0) #define mtrr_bp_restore() do {} while (0) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 10f8d47962407..8b1947ba3e62e 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -759,8 +759,16 @@ void __init mtrr_bp_init(void) } } - if (!mtrr_enabled()) + if (!mtrr_enabled()) { pr_info("MTRR: Disabled\n"); + + /* + * PAT initialization relies on MTRR's rendezvous handler. + * Skip PAT init until the handler can initialize both + * features independently. + */ + pat_disable("MTRRs disabled, skipping PAT initialization too."); + } } void mtrr_ap_init(void) -- GitLab From ad025a73f0e9344ac73ffe1b74c184033e08e7d5 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:42:01 -0600 Subject: [PATCH 009/705] x86/mtrr: Fix PAT init handling when MTRR is disabled get_mtrr_state() calls pat_init() on BSP even if MTRR is disabled. This results in calling pat_init() on BSP only since APs do not call pat_init() when MTRR is disabled. This inconsistency between BSP and APs leads to undefined behavior. Make BSP's calling condition to pat_init() consistent with AP's, mtrr_ap_init() and mtrr_aps_init(). Signed-off-by: Toshi Kani Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Toshi Kani Cc: elliott@hpe.com Cc: konrad.wilk@oracle.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-6-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 24 ++++++++++++++---------- arch/x86/kernel/cpu/mtrr/main.c | 3 +++ arch/x86/kernel/cpu/mtrr/mtrr.h | 1 + 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 19f57360dfd25..8d7a29ed93771 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -444,11 +444,24 @@ static void __init print_mtrr_state(void) pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } +/* PAT setup for BP. 
We need to go through sync steps here */ +void __init mtrr_bp_pat_init(void) +{ + unsigned long flags; + + local_irq_save(flags); + prepare_set(); + + pat_init(); + + post_set(); + local_irq_restore(flags); +} + /* Grab all of the MTRR state for this CPU into *state */ bool __init get_mtrr_state(void) { struct mtrr_var_range *vrs; - unsigned long flags; unsigned lo, dummy; unsigned int i; @@ -481,15 +494,6 @@ bool __init get_mtrr_state(void) mtrr_state_set = 1; - /* PAT setup for BP. We need to go through sync steps here */ - local_irq_save(flags); - prepare_set(); - - pat_init(); - - post_set(); - local_irq_restore(flags); - return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 8b1947ba3e62e..7d393ecdeee69 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -752,6 +752,9 @@ void __init mtrr_bp_init(void) /* BIOS may override */ __mtrr_enabled = get_mtrr_state(); + if (mtrr_enabled()) + mtrr_bp_pat_init(); + if (mtrr_cleanup(phys_addr)) { changed_by_mtrr_cleanup = 1; mtrr_if->set_all(); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 951884dcc4335..6c7ced07d16d1 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -52,6 +52,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); bool get_mtrr_state(void); +void mtrr_bp_pat_init(void); extern void set_mtrr_ops(const struct mtrr_ops *ops); -- GitLab From 88ba281108ed0c25c9d292b48bd3f272fcb90dd0 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:42:02 -0600 Subject: [PATCH 010/705] x86/xen, pat: Remove PAT table init code from Xen Xen supports PAT without MTRRs for its guests. In order to enable WC attribute, it was necessary for xen_start_kernel() to call pat_init_cache_modes() to update PAT table before starting guest kernel. Now that the kernel initializes PAT table to the BIOS handoff state when MTRR is disabled, this Xen-specific PAT init code is no longer necessary. Delete it from xen_start_kernel(). Also change __init_cache_modes() to a static function since PAT table should not be tweaked by other modules. Signed-off-by: Toshi Kani Reviewed-by: Thomas Gleixner Acked-by: Juergen Gross Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Toshi Kani Cc: elliott@hpe.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-7-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pat.h | 1 - arch/x86/mm/pat.c | 2 +- arch/x86/xen/enlighten.c | 9 --------- 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 0ad356c066eff..0b1ff4c1c14e7 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -7,7 +7,6 @@ bool pat_enabled(void); void pat_disable(const char *reason); extern void pat_init(void); -void __init_cache_modes(u64); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 59ec038b98339..c4c3ddcc9069d 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -192,7 +192,7 @@ static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) * configuration. * Using lower indices is preferred, so we start with highest index. */ -void __init_cache_modes(u64 pat) +static void __init_cache_modes(u64 pat) { enum page_cache_mode cache; char pat_msg[33]; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c469a7c7c3094..d8cca75e3b3e2 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -75,7 +75,6 @@ #include #include #include -#include #include #ifdef CONFIG_ACPI @@ -1511,7 +1510,6 @@ asmlinkage __visible void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; unsigned long initrd_start = 0; - u64 pat; int rc; if (!xen_start_info) @@ -1618,13 +1616,6 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_start_info->nr_pages); xen_reserve_special_pages(); - /* - * Modify the cache mode translation tables to match Xen's PAT - * configuration. - */ - rdmsrl(MSR_IA32_CR_PAT, pat); - __init_cache_modes(pat); - /* keep using Xen gdt for now; no urgent need to change it */ #ifdef CONFIG_X86_32 -- GitLab From b6350c21cfe8aa9d65e189509a23c0ea4b8362c2 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Mar 2016 15:42:03 -0600 Subject: [PATCH 011/705] x86/pat: Document the PAT initialization sequence Update PAT documentation to describe how PAT is initialized under various configurations. Signed-off-by: Toshi Kani Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Toshi Kani Cc: elliott@hpe.com Cc: konrad.wilk@oracle.com Cc: paul.gortmaker@windriver.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1458769323-24491-8-git-send-email-toshi.kani@hpe.com Signed-off-by: Ingo Molnar --- Documentation/x86/pat.txt | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt index 54944c71b819b..2a4ee6302122f 100644 --- a/Documentation/x86/pat.txt +++ b/Documentation/x86/pat.txt @@ -196,3 +196,35 @@ Another, more verbose way of getting PAT related debug messages is with "debugpat" boot parameter. With this parameter, various debug messages are printed to dmesg log. +PAT Initialization +------------------ + +The following table describes how PAT is initialized under various +configurations. The PAT MSR must be updated by Linux in order to support WC +and WT attributes. 
Otherwise, the PAT MSR has the value programmed in it
+by the firmware. Note, Xen enables WC attribute in the PAT MSR for guests.
+
+ MTRR PAT   Call Sequence               PAT State  PAT MSR
+ =========================================================
+ E    E     MTRR -> PAT init            Enabled    OS
+ E    D     MTRR -> PAT init            Disabled   -
+ D    E     MTRR -> PAT disable         Disabled   BIOS
+ D    D     MTRR -> PAT disable         Disabled   -
+ -    np/E  PAT  -> PAT disable         Disabled   BIOS
+ -    np/D  PAT  -> PAT disable         Disabled   -
+ E    !P/E  MTRR -> PAT init            Disabled   BIOS
+ D    !P/E  MTRR -> PAT disable         Disabled   BIOS
+ !M   !P/E  MTRR stub -> PAT disable    Disabled   BIOS
+
+ Legend
+ ------------------------------------------------
+ E         Feature enabled in CPU
+ D         Feature disabled/unsupported in CPU
+ np        "nopat" boot option specified
+ !P        CONFIG_X86_PAT option unset
+ !M        CONFIG_MTRR option unset
+ Enabled   PAT state set to enabled
+ Disabled  PAT state set to disabled
+ OS        PAT initializes PAT MSR with OS setting
+ BIOS      PAT keeps PAT MSR with BIOS setting
+
-- 
GitLab


From b3edfda4382ffaef5e5c1cffb25a33b3b9ef4546 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Wed, 16 Mar 2016 13:19:29 +0100
Subject: [PATCH 012/705] x86/cpu: Do the feature test first in enable_sep_cpu()

... before assigning local vars. Kill the out label too and simplify.

No functionality change.

Signed-off-by: Borislav Petkov
Acked-by: Andy Lutomirski
Cc: Andy Lutomirski
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1458130769-24963-1-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/cpu/common.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 8394b3d1f94fc..7fea4079d1020 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1076,12 +1076,12 @@ void enable_sep_cpu(void)
 	struct tss_struct *tss;
 	int cpu;
 
+	if (!boot_cpu_has(X86_FEATURE_SEP))
+		return;
+
 	cpu = get_cpu();
 	tss = &per_cpu(cpu_tss, cpu);
 
-	if (!boot_cpu_has(X86_FEATURE_SEP))
-		goto out;
-
 	/*
 	 * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
 	 * see the big comment in struct x86_hw_tss's definition.
@@ -1096,7 +1096,6 @@ void enable_sep_cpu(void)
 	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
 
-out:
 	put_cpu();
 }
 #endif
-- 
GitLab


From ad16511b0e404652331a5350c522d0824f8209de Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Thu, 24 Mar 2016 13:52:16 +0100
Subject: [PATCH 013/705] perf mem: Add -U/-K (--all-user/--all-kernel) options

Add -U/-K (--all-user/--all-kernel) options to use the perf record
--all-user/--all-kernel options.

Signed-off-by: Jiri Olsa
Cc: Andi Kleen
Cc: David Ahern
Cc: Namhyung Kim
Cc: Peter Zijlstra
Cc: Stephane Eranian
Link: http://lkml.kernel.org/r/1458823940-24583-3-git-send-email-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/Documentation/perf-mem.txt |  8 ++++++++
 tools/perf/builtin-mem.c              | 11 ++++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt
index 43310d8661fed..1d6092c460dd0 100644
--- a/tools/perf/Documentation/perf-mem.txt
+++ b/tools/perf/Documentation/perf-mem.txt
@@ -48,6 +48,14 @@ OPTIONS
 	option can be passed in record mode. It will be interpreted the
 	same way as perf record.
 
+-K::
+--all-kernel::
+	Configure all used events to run in kernel space.
+
+-U::
+--all-user::
+	Configure all used events to run in user space.
+ SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-report[1] diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 85db3be4b3cb6..1dc140c5481d6 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -62,19 +62,22 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) int rec_argc, i = 0, j; const char **rec_argv; int ret; + bool all_user = false, all_kernel = false; struct option options[] = { OPT_CALLBACK('e', "event", &mem, "event", "event selector. use 'perf mem record -e list' to list available events", parse_record_events), OPT_INCR('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"), + OPT_BOOLEAN('U', "--all-user", &all_user, "collect only user level data"), + OPT_BOOLEAN('K', "--all-kernel", &all_kernel, "collect only kernel level data"), OPT_END() }; argc = parse_options(argc, argv, options, record_mem_usage, PARSE_OPT_STOP_AT_NON_OPTION); - rec_argc = argc + 7; /* max number of arguments */ + rec_argc = argc + 9; /* max number of arguments */ rec_argv = calloc(rec_argc + 1, sizeof(char *)); if (!rec_argv) return -1; @@ -103,6 +106,12 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) rec_argv[i++] = perf_mem_events__name(j); }; + if (all_user) + rec_argv[i++] = "--all-user"; + + if (all_kernel) + rec_argv[i++] = "--all-kernel"; + for (j = 0; j < argc; j++, i++) rec_argv[i] = argv[j]; -- GitLab From 592dac6f35cf222a7687d4ff1ea7df0e6ef722e0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 24 Mar 2016 13:52:17 +0100 Subject: [PATCH 014/705] perf tools: Make hists__collapse_insert_entry static No need to export hists__collapse_insert_entry function. Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1458823940-24583-4-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 5 +++-- tools/perf/util/hist.h | 2 -- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 31c4641fe5ff0..3d34c57dfbe26 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1295,8 +1295,9 @@ static int hists__hierarchy_insert_entry(struct hists *hists, return ret; } -int hists__collapse_insert_entry(struct hists *hists, struct rb_root *root, - struct hist_entry *he) +static int hists__collapse_insert_entry(struct hists *hists, + struct rb_root *root, + struct hist_entry *he) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index bec0cd660fbd6..588596561cb31 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -199,8 +199,6 @@ int hists__init(void); int __hists__init(struct hists *hists, struct perf_hpp_list *hpp_list); struct rb_root *hists__get_rotate_entries_in(struct hists *hists); -int hists__collapse_insert_entry(struct hists *hists, - struct rb_root *root, struct hist_entry *he); struct perf_hpp { char *buf; -- GitLab From e0be62cc0325d65e1b7ae55d23e3d224638c20a6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 24 Mar 2016 13:52:19 +0100 Subject: [PATCH 015/705] perf tools: Make -f/--force option documentation consistent across tools Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1458823940-24583-6-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- 
 tools/perf/Documentation/perf-annotate.txt | 2 +-
 tools/perf/Documentation/perf-diff.txt     | 2 +-
 tools/perf/Documentation/perf-report.txt   | 2 +-
 tools/perf/Documentation/perf-script.txt   | 4 ++++
 4 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
index e9cd39a92dc22..778f54d4d0bd7 100644
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -33,7 +33,7 @@ OPTIONS
 
 -f::
 --force::
-	Don't complain, do it.
+	Don't do ownership validation.
 
 -v::
 --verbose::
diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt
index d1deb573877fe..3e9490b9c5334 100644
--- a/tools/perf/Documentation/perf-diff.txt
+++ b/tools/perf/Documentation/perf-diff.txt
@@ -75,7 +75,7 @@ OPTIONS
 
 -f::
 --force::
-	Don't complain, do it.
+	Don't do ownership validation.
 
 --symfs=::
 	Look for files with symbols relative to this directory.
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 12113992ac9d0..496d42cdf02b1 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -285,7 +285,7 @@ OPTIONS
 
 -f::
 --force::
-	Don't complain, do it.
+	Don't do ownership validation.
 
 --symfs=::
 	Look for files with symbols relative to this directory.
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 382ddfb45d1db..22ef3933342ad 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -262,6 +262,10 @@ include::itrace.txt[]
 --ns::
 	Use 9 decimal places when displaying time (i.e. show the nanoseconds)
 
+-f::
+--force::
+	Don't do ownership validation.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-script-perl[1],
-- 
GitLab


From b31d660df37c1701fd18d526faeb9a86f0fc7dd2 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Thu, 24 Mar 2016 13:52:20 +0100
Subject: [PATCH 016/705] perf tests: Add test to check for event times

This test creates the software event 'cpu-clock', attaches it in
several ways, and checks that the enabled and running times match.
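For readers unfamiliar with the two counters being compared: with
read_format including PERF_FORMAT_TOTAL_TIME_ENABLED and
PERF_FORMAT_TOTAL_TIME_RUNNING, a read() on a perf event fd returns
the count followed by both times, and the two only diverge when the
event gets multiplexed. The following is a freestanding C sketch of
the same check using the raw perf_event_open(2) interface; it is
illustrative only (minimal error handling), not code from the patch:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/perf_event.h>

    /* layout of a read() with the two TOTAL_TIME read_format bits set */
    struct read_val {
            unsigned long long value, time_enabled, time_running;
    };

    int main(void)
    {
            struct perf_event_attr attr;
            struct read_val val;
            volatile int i;
            int fd;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_SOFTWARE;
            attr.config = PERF_COUNT_SW_CPU_CLOCK;
            attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
                               PERF_FORMAT_TOTAL_TIME_RUNNING;

            /* measure this thread, any CPU */
            fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
            if (fd < 0) {
                    perror("perf_event_open");
                    return 1;
            }

            for (i = 0; i < 100000000; i++)     /* spin, as the test does */
                    ;

            if (read(fd, &val, sizeof(val)) != sizeof(val))
                    return 1;

            /* for a lone software event these should normally match */
            printf("ena %llu, run %llu\n", val.time_enabled, val.time_running);
            return 0;
    }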
Committer notes: Testing it: [acme@jouet linux]$ perf test -v times 44: Test events times : --- start --- test child forked, pid 27170 attaching to spawned child, enable on exec OK : ena 307328, run 307328 attaching to current thread as enabled OK : ena 7826, run 7826 attaching to current thread as disabled OK : ena 738, run 738 attaching to CPU 0 as enabled SKIP : not enough rights attaching to CPU 0 as enabled SKIP : not enough rights test child finished with -2 ---- end ---- Test events times: Skip [acme@jouet linux]$ [root@jouet ~]# perf test times 44: Test events times : Ok [root@jouet ~]# perf test -v times 44: Test events times : --- start --- test child forked, pid 27306 attaching to spawned child, enable on exec OK : ena 479290, run 479290 attaching to current thread as enabled OK : ena 11356, run 11356 attaching to current thread as disabled OK : ena 987, run 987 attaching to CPU 0 as enabled OK : ena 3717, run 3717 attaching to CPU 0 as enabled OK : ena 2323, run 2323 test child finished with 0 ---- end ---- Test events times: Ok [root@jouet ~]# Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1458823940-24583-7-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/Build | 1 + tools/perf/tests/builtin-test.c | 4 + tools/perf/tests/event-times.c | 236 ++++++++++++++++++++++++++++++++ tools/perf/tests/tests.h | 1 + 4 files changed, 242 insertions(+) create mode 100644 tools/perf/tests/event-times.c diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build index 1ba628ed049ad..449fe97a555f7 100644 --- a/tools/perf/tests/Build +++ b/tools/perf/tests/Build @@ -37,6 +37,7 @@ perf-y += topology.o perf-y += cpumap.o perf-y += stat.o perf-y += event_update.o +perf-y += event-times.o $(OUTPUT)tests/llvm-src-base.c: tests/bpf-script-example.c tests/Build $(call rule_mkdir) diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index f2b1dcac45d30..93c467015e711 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -203,6 +203,10 @@ static struct test generic_tests[] = { .desc = "Test attr update synthesize", .func = test__event_update, }, + { + .desc = "Test events times", + .func = test__event_times, + }, { .func = NULL, }, diff --git a/tools/perf/tests/event-times.c b/tools/perf/tests/event-times.c new file mode 100644 index 0000000000000..95fb744f6628b --- /dev/null +++ b/tools/perf/tests/event-times.c @@ -0,0 +1,236 @@ +#include +#include +#include "tests.h" +#include "evlist.h" +#include "evsel.h" +#include "util.h" +#include "debug.h" +#include "thread_map.h" +#include "target.h" + +static int attach__enable_on_exec(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__last(evlist); + struct target target = { + .uid = UINT_MAX, + }; + const char *argv[] = { "true", NULL, }; + char sbuf[STRERR_BUFSIZE]; + int err; + + pr_debug("attaching to spawned child, enable on exec\n"); + + err = perf_evlist__create_maps(evlist, &target); + if (err < 0) { + pr_debug("Not enough memory to create thread/cpu maps\n"); + return err; + } + + err = perf_evlist__prepare_workload(evlist, &target, argv, false, NULL); + if (err < 0) { + pr_debug("Couldn't run the workload!\n"); + return err; + } + + evsel->attr.enable_on_exec = 1; + + err = perf_evlist__open(evlist); + if (err < 0) { + pr_debug("perf_evlist__open: %s\n", + strerror_r(errno, sbuf, sizeof(sbuf))); 
+ return err; + } + + return perf_evlist__start_workload(evlist) == 1 ? TEST_OK : TEST_FAIL; +} + +static int detach__enable_on_exec(struct perf_evlist *evlist) +{ + waitpid(evlist->workload.pid, NULL, 0); + return 0; +} + +static int attach__current_disabled(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__last(evlist); + struct thread_map *threads; + int err; + + pr_debug("attaching to current thread as disabled\n"); + + threads = thread_map__new(-1, getpid(), UINT_MAX); + if (threads == NULL) { + pr_debug("thread_map__new\n"); + return -1; + } + + evsel->attr.disabled = 1; + + err = perf_evsel__open_per_thread(evsel, threads); + if (err) { + pr_debug("Failed to open event cpu-clock:u\n"); + return err; + } + + thread_map__put(threads); + return perf_evsel__enable(evsel) == 0 ? TEST_OK : TEST_FAIL; +} + +static int attach__current_enabled(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__last(evlist); + struct thread_map *threads; + int err; + + pr_debug("attaching to current thread as enabled\n"); + + threads = thread_map__new(-1, getpid(), UINT_MAX); + if (threads == NULL) { + pr_debug("failed to call thread_map__new\n"); + return -1; + } + + err = perf_evsel__open_per_thread(evsel, threads); + + thread_map__put(threads); + return err == 0 ? TEST_OK : TEST_FAIL; +} + +static int detach__disable(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__last(evlist); + + return perf_evsel__enable(evsel); +} + +static int attach__cpu_disabled(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__last(evlist); + struct cpu_map *cpus; + int err; + + pr_debug("attaching to CPU 0 as enabled\n"); + + cpus = cpu_map__new("0"); + if (cpus == NULL) { + pr_debug("failed to call cpu_map__new\n"); + return -1; + } + + evsel->attr.disabled = 1; + + err = perf_evsel__open_per_cpu(evsel, cpus); + if (err) { + if (err == -EACCES) + return TEST_SKIP; + + pr_debug("Failed to open event cpu-clock:u\n"); + return err; + } + + cpu_map__put(cpus); + return perf_evsel__enable(evsel); +} + +static int attach__cpu_enabled(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__last(evlist); + struct cpu_map *cpus; + int err; + + pr_debug("attaching to CPU 0 as enabled\n"); + + cpus = cpu_map__new("0"); + if (cpus == NULL) { + pr_debug("failed to call cpu_map__new\n"); + return -1; + } + + err = perf_evsel__open_per_cpu(evsel, cpus); + if (err == -EACCES) + return TEST_SKIP; + + cpu_map__put(cpus); + return err ? 
TEST_FAIL : TEST_OK; +} + +static int test_times(int (attach)(struct perf_evlist *), + int (detach)(struct perf_evlist *)) +{ + struct perf_counts_values count; + struct perf_evlist *evlist = NULL; + struct perf_evsel *evsel; + int err = -1, i; + + evlist = perf_evlist__new(); + if (!evlist) { + pr_debug("failed to create event list\n"); + goto out_err; + } + + err = parse_events(evlist, "cpu-clock:u", NULL); + if (err) { + pr_debug("failed to parse event cpu-clock:u\n"); + goto out_err; + } + + evsel = perf_evlist__last(evlist); + evsel->attr.read_format |= + PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING; + + err = attach(evlist); + if (err == TEST_SKIP) { + pr_debug(" SKIP : not enough rights\n"); + return err; + } + + TEST_ASSERT_VAL("failed to attach", !err); + + for (i = 0; i < 100000000; i++) { } + + TEST_ASSERT_VAL("failed to detach", !detach(evlist)); + + perf_evsel__read(evsel, 0, 0, &count); + + err = !(count.ena == count.run); + + pr_debug(" %s: ena %" PRIu64", run %" PRIu64"\n", + !err ? "OK " : "FAILED", + count.ena, count.run); + +out_err: + if (evlist) + perf_evlist__delete(evlist); + return !err ? TEST_OK : TEST_FAIL; +} + +/* + * This test creates software event 'cpu-clock' + * attaches it in several ways (explained below) + * and checks that enabled and running times + * match. + */ +int test__event_times(int subtest __maybe_unused) +{ + int err, ret = 0; + +#define _T(attach, detach) \ + err = test_times(attach, detach); \ + if (err && (ret == TEST_OK || ret == TEST_SKIP)) \ + ret = err; + + /* attach on newly spawned process after exec */ + _T(attach__enable_on_exec, detach__enable_on_exec) + /* attach on current process as enabled */ + _T(attach__current_enabled, detach__disable) + /* attach on current process as disabled */ + _T(attach__current_disabled, detach__disable) + /* attach on cpu as disabled */ + _T(attach__cpu_disabled, detach__disable) + /* attach on cpu as enabled */ + _T(attach__cpu_enabled, detach__disable) + +#undef _T + return ret; +} diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index 82b2b5e6ba7c7..0fc946989cf0f 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -85,6 +85,7 @@ int test__synthesize_stat_config(int subtest); int test__synthesize_stat(int subtest); int test__synthesize_stat_round(int subtest); int test__event_update(int subtest); +int test__event_times(int subtest); #if defined(__arm__) || defined(__aarch64__) #ifdef HAVE_DWARF_UNWIND_SUPPORT -- GitLab From 58cb9d650be45100bf53ddf9e00351391de3d735 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Mon, 28 Mar 2016 02:22:18 +0900 Subject: [PATCH 017/705] perf config: Remove duplicated set_buildid_dir calls Signed-off-by: Taeung Song Acked-by: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1459099340-16911-1-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/perf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/perf/perf.c b/tools/perf/perf.c index aaee0a7827477..7b2df2b46525f 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -549,6 +549,7 @@ int main(int argc, const char **argv) srandom(time(NULL)); perf_config(perf_default_config, NULL); + set_buildid_dir(NULL); /* get debugfs/tracefs mount point from /proc/mounts */ tracing_path_mount(); @@ -572,7 +573,6 @@ int main(int argc, const char **argv) } if (!prefixcmp(cmd, "trace")) { #ifdef HAVE_LIBAUDIT_SUPPORT - set_buildid_dir(NULL); setup_path(); argv[0] = "trace"; return 
cmd_trace(argc, argv, NULL);
@@ -587,7 +587,6 @@
 	argc--;
 	handle_options(&argv, &argc, NULL);
 	commit_pager_choice();
-	set_buildid_dir(NULL);
 
 	if (argc > 0) {
 		if (!prefixcmp(argv[0], "--"))
-- 
GitLab


From 9cb5987c822714352e3eb46806fc260b3cb4ff0d Mon Sep 17 00:00:00 2001
From: Taeung Song
Date: Mon, 28 Mar 2016 02:22:19 +0900
Subject: [PATCH 018/705] perf config: Rework buildid_dir_command_config to perf_buildid_config

To avoid repeatedly calling perf_config(), remove
buildid_dir_command_config() and add a new perf_buildid_config() into
perf_default_config(), because perf_config() is already called with
perf_default_config at main().

Signed-off-by: Taeung Song
Acked-by: Jiri Olsa
Cc: Namhyung Kim
Cc: Peter Zijlstra
Cc: Wang Nan
Link: http://lkml.kernel.org/r/1459099340-16911-2-git-send-email-treeze.taeung@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/util/config.c | 50 +++++++++++++++------------------------
 1 file changed, 18 insertions(+), 32 deletions(-)

diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 4e727635476ea..2dd78f4c97a02 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -377,6 +377,21 @@ const char *perf_config_dirname(const char *name, const char *value)
 	return value;
 }
 
+static int perf_buildid_config(const char *var, const char *value)
+{
+	/* same dir for all commands */
+	if (!strcmp(var, "buildid.dir")) {
+		const char *dirname = perf_config_dirname(var, value);
+
+		if (!dirname)
+			return -1;
+		strncpy(buildid_dir, dirname, MAXPATHLEN-1);
+		buildid_dir[MAXPATHLEN-1] = '\0';
+	}
+
+	return 0;
+}
+
 static int perf_default_core_config(const char *var __maybe_unused,
 				    const char *value __maybe_unused)
 {
@@ -412,6 +427,9 @@ int perf_default_config(const char *var, const char *value,
 	if (!prefixcmp(var, "llvm."))
 		return perf_llvm_config(var, value);
 
+	if (!prefixcmp(var, "buildid."))
+		return perf_buildid_config(var, value);
+
 	/* Add other config variables here. */
 	return 0;
 }
@@ -515,43 +533,11 @@ int config_error_nonbool(const char *var)
 	return error("Missing value for '%s'", var);
 }
 
-struct buildid_dir_config {
-	char *dir;
-};
-
-static int buildid_dir_command_config(const char *var, const char *value,
-				      void *data)
-{
-	struct buildid_dir_config *c = data;
-	const char *v;
-
-	/* same dir for all commands */
-	if (!strcmp(var, "buildid.dir")) {
-		v = perf_config_dirname(var, value);
-		if (!v)
-			return -1;
-		strncpy(c->dir, v, MAXPATHLEN-1);
-		c->dir[MAXPATHLEN-1] = '\0';
-	}
-	return 0;
-}
-
-static void check_buildid_dir_config(void)
-{
-	struct buildid_dir_config c;
-	c.dir = buildid_dir;
-	perf_config(buildid_dir_command_config, &c);
-}
-
 void set_buildid_dir(const char *dir)
 {
 	if (dir)
 		scnprintf(buildid_dir, MAXPATHLEN-1, "%s", dir);
 
-	/* try config file */
-	if (buildid_dir[0] == '\0')
-		check_buildid_dir_config();
-
 	/* default to $HOME/.debug */
 	if (buildid_dir[0] == '\0') {
 		char *v = getenv("HOME");
-- 
GitLab


From 37194f443a5a7157866ba68b04827e111100167b Mon Sep 17 00:00:00 2001
From: Taeung Song
Date: Mon, 28 Mar 2016 02:22:20 +0900
Subject: [PATCH 019/705] perf config: Rename 'v' to 'home' in set_buildid_dir()

Change the variable name 'v' to 'home' to make it more readable.
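To make the config key concrete: buildid.dir, which
perf_buildid_config() above parses, lives in perf's git-style config
files, so after this series the cache directory is resolved from the
--buildid-dir command-line option if given, otherwise from a config
entry such as the hypothetical ~/.perfconfig snippet below (the path
is only an example), and finally from the $HOME/.debug fallback shown
in the diff that follows:

    [buildid]
            dir = /var/cache/perf-buildids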
Signed-off-by: Taeung Song Acked-by: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1459099340-16911-3-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/config.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index 2dd78f4c97a02..5c20d783423be 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -540,10 +540,11 @@ void set_buildid_dir(const char *dir) /* default to $HOME/.debug */ if (buildid_dir[0] == '\0') { - char *v = getenv("HOME"); - if (v) { + char *home = getenv("HOME"); + + if (home) { snprintf(buildid_dir, MAXPATHLEN-1, "%s/%s", - v, DEBUG_CACHE_DIR); + home, DEBUG_CACHE_DIR); } else { strncpy(buildid_dir, DEBUG_CACHE_DIR, MAXPATHLEN-1); } -- GitLab From f7380c12ec6cfd69f274ba6181cd01c764f877bb Mon Sep 17 00:00:00 2001 From: Dima Kogan Date: Tue, 29 Mar 2016 12:47:53 -0300 Subject: [PATCH 020/705] perf script perl: Perl scripts now get a backtrace, like the python ones We have some infrastructure to use perl or python to analyze logs generated by perf. Prior to this patch, only the python tools had access to backtrace information. This patch makes this information available to perl scripts as well. Example: Let's look at malloc() calls made by the seq utility. First we create a probe point: $ perf probe -x /lib/x86_64-linux-gnu/libc.so.6 malloc Added new events: ... Now we run seq, while monitoring malloc() calls with perf $ perf record --call-graph=dwarf -e probe_libc:malloc seq 5 1 2 3 4 5 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.064 MB perf.data (6 samples) ] We can use perf to look at its log to see the malloc calls and the backtrace $ perf script seq 14195 [000] 1927993.748254: probe_libc:malloc: (7f9ff8edd320) bytes=0x22 7f9ff8edd320 malloc (/lib/x86_64-linux-gnu/libc-2.22.so) 7f9ff8e8eab0 set_binding_values.part.0 (/lib/x86_64-linux-gnu/libc-2.22.so) 7f9ff8e8eda1 __bindtextdomain (/lib/x86_64-linux-gnu/libc-2.22.so) 401b22 main (/usr/bin/seq) 7f9ff8e82610 __libc_start_main (/lib/x86_64-linux-gnu/libc-2.22.so) 402799 _start (/usr/bin/seq) ... We can also use the scripting facilities. We create a skeleton perl script that simply prints out the events $ perf script -g perl generated Perl script: perf-script.pl We can then use this script to see the malloc() calls with a backtrace. Prior to this patch, the backtrace was not available to the perl scripts. $ perf script -s perf-script.pl probe_libc::malloc 0 1927993.748254260 14195 seq __probe_ip=140325052863264, bytes=34 [7f9ff8edd320] malloc [7f9ff8e8eab0] set_binding_values.part.0 [7f9ff8e8eda1] __bindtextdomain [401b22] main [7f9ff8e82610] __libc_start_main [402799] _start ... 
Tested-by: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/87mvphzld0.fsf@secretsauce.net Signed-off-by: Dima Kogan --- .../util/scripting-engines/trace-event-perl.c | 114 ++++++++++++++++-- 1 file changed, 106 insertions(+), 8 deletions(-) diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index b3aabc0d4eb00..1d160855cda92 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -31,6 +31,8 @@ #include #include "../../perf.h" +#include "../callchain.h" +#include "../machine.h" #include "../thread.h" #include "../event.h" #include "../trace-event.h" @@ -248,10 +250,78 @@ static void define_event_symbols(struct event_format *event, define_event_symbols(event, ev_name, args->next); } +static SV *perl_process_callchain(struct perf_sample *sample, + struct perf_evsel *evsel, + struct addr_location *al) +{ + AV *list; + + list = newAV(); + if (!list) + goto exit; + + if (!symbol_conf.use_callchain || !sample->callchain) + goto exit; + + if (thread__resolve_callchain(al->thread, evsel, + sample, NULL, NULL, + PERF_MAX_STACK_DEPTH) != 0) { + pr_err("Failed to resolve callchain. Skipping\n"); + goto exit; + } + callchain_cursor_commit(&callchain_cursor); + + + while (1) { + HV *elem; + struct callchain_cursor_node *node; + node = callchain_cursor_current(&callchain_cursor); + if (!node) + break; + + elem = newHV(); + if (!elem) + goto exit; + + hv_stores(elem, "ip", newSVuv(node->ip)); + + if (node->sym) { + HV *sym = newHV(); + if (!sym) + goto exit; + hv_stores(sym, "start", newSVuv(node->sym->start)); + hv_stores(sym, "end", newSVuv(node->sym->end)); + hv_stores(sym, "binding", newSVuv(node->sym->binding)); + hv_stores(sym, "name", newSVpvn(node->sym->name, + node->sym->namelen)); + hv_stores(elem, "sym", newRV_noinc((SV*)sym)); + } + + if (node->map) { + struct map *map = node->map; + const char *dsoname = "[unknown]"; + if (map && map->dso && (map->dso->name || map->dso->long_name)) { + if (symbol_conf.show_kernel_path && map->dso->long_name) + dsoname = map->dso->long_name; + else if (map->dso->name) + dsoname = map->dso->name; + } + hv_stores(elem, "dso", newSVpv(dsoname,0)); + } + + callchain_cursor_advance(&callchain_cursor); + av_push(list, newRV_noinc((SV*)elem)); + } + +exit: + return newRV_noinc((SV*)list); +} + static void perl_process_tracepoint(struct perf_sample *sample, struct perf_evsel *evsel, - struct thread *thread) + struct addr_location *al) { + struct thread *thread = al->thread; struct event_format *event = evsel->tp_format; struct format_field *field; static char handler[256]; @@ -295,6 +365,7 @@ static void perl_process_tracepoint(struct perf_sample *sample, XPUSHs(sv_2mortal(newSVuv(ns))); XPUSHs(sv_2mortal(newSViv(pid))); XPUSHs(sv_2mortal(newSVpv(comm, 0))); + XPUSHs(sv_2mortal(perl_process_callchain(sample, evsel, al))); /* common fields other than pid can be accessed via xsub fns */ @@ -329,6 +400,7 @@ static void perl_process_tracepoint(struct perf_sample *sample, XPUSHs(sv_2mortal(newSVuv(nsecs))); XPUSHs(sv_2mortal(newSViv(pid))); XPUSHs(sv_2mortal(newSVpv(comm, 0))); + XPUSHs(sv_2mortal(perl_process_callchain(sample, evsel, al))); call_pv("main::trace_unhandled", G_SCALAR); } SPAGAIN; @@ -366,7 +438,7 @@ static void perl_process_event(union perf_event *event, struct perf_evsel *evsel, struct addr_location *al) { - perl_process_tracepoint(sample, evsel, al->thread); + perl_process_tracepoint(sample, evsel, al); 
	perl_process_event_generic(event, sample, evsel);
 }
 
@@ -490,7 +562,27 @@ static int perl_generate_script(struct pevent *pevent, const char *outfile)
 	fprintf(ofp, "use Perf::Trace::Util;\n\n");
 
 	fprintf(ofp, "sub trace_begin\n{\n\t# optional\n}\n\n");
-	fprintf(ofp, "sub trace_end\n{\n\t# optional\n}\n\n");
+	fprintf(ofp, "sub trace_end\n{\n\t# optional\n}\n");
+
+
+	fprintf(ofp, "\n\
+sub print_backtrace\n\
+{\n\
+	my $callchain = shift;\n\
+	for my $node (@$callchain)\n\
+	{\n\
+		if(exists $node->{sym})\n\
+		{\n\
+			printf( \"\\t[\\%%x] \\%%s\\n\", $node->{ip}, $node->{sym}{name});\n\
+		}\n\
+		else\n\
+		{\n\
+			printf( \"\\t[\\%%x]\\n\", $node->{ip});\n\
+		}\n\
+	}\n\
+}\n\n\
+");
+
 	while ((event = trace_find_next_event(pevent, event))) {
 		fprintf(ofp, "sub %s::%s\n{\n", event->system, event->name);
@@ -502,7 +594,8 @@ static int perl_generate_script(struct pevent *pevent, const char *outfile)
 			fprintf(ofp, "$common_secs, ");
 			fprintf(ofp, "$common_nsecs,\n");
 			fprintf(ofp, "\t $common_pid, ");
-			fprintf(ofp, "$common_comm,\n\t ");
+			fprintf(ofp, "$common_comm, ");
+			fprintf(ofp, "$common_callchain,\n\t ");
 		not_first = 0;
 		count = 0;
@@ -519,7 +612,7 @@ static int perl_generate_script(struct pevent *pevent, const char *outfile)
 		fprintf(ofp, "\tprint_header($event_name, $common_cpu, "
 			"$common_secs, $common_nsecs,\n\t "
-			"$common_pid, $common_comm);\n\n");
+			"$common_pid, $common_comm, $common_callchain);\n\n");
 
 		fprintf(ofp, "\tprintf(\"");
@@ -581,17 +674,22 @@ static int perl_generate_script(struct pevent *pevent, const char *outfile)
 				fprintf(ofp, "$%s", f->name);
 		}
 
-		fprintf(ofp, ");\n");
+		fprintf(ofp, ");\n\n");
+
+		fprintf(ofp, "\tprint_backtrace($common_callchain);\n");
+
 		fprintf(ofp, "}\n\n");
 	}
 
 	fprintf(ofp, "sub trace_unhandled\n{\n\tmy ($event_name, $context, "
 		"$common_cpu, $common_secs, $common_nsecs,\n\t "
-		"$common_pid, $common_comm) = @_;\n\n");
+		"$common_pid, $common_comm, $common_callchain) = @_;\n\n");
 
 	fprintf(ofp, "\tprint_header($event_name, $common_cpu, "
 		"$common_secs, $common_nsecs,\n\t $common_pid, "
-		"$common_comm);\n}\n\n");
+		"$common_comm, $common_callchain);\n");
+	fprintf(ofp, "\tprint_backtrace($common_callchain);\n");
+	fprintf(ofp, "}\n\n");
 
 	fprintf(ofp, "sub print_header\n{\n"
 		"\tmy ($event_name, $cpu, $secs, $nsecs, $pid, $comm) = @_;\n\n"
--
GitLab


From d1706b39f0af6901ab2a5e2ebb210b53c1a5bdc7 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Mon, 28 Mar 2016 10:45:38 -0700
Subject: [PATCH 021/705] perf tools: Add support for skipping itrace
 instructions

When using 'perf script' to look at PT traces it is often useful to
ignore the initialization code at the beginning.

On larger traces which may have many millions of instructions in
initialization code doing that in a pipeline can be very slow, with
perf script spending a lot of CPU time calling printf and writing data.

This patch adds an extension to the --itrace argument that skips 'n'
events (instructions, branches or transactions) at the beginning. This
is much more efficient.

v2: Add support for BTS (Adrian Hunter)
    Document in itrace.txt
    Fix branch check
    Check transactions and instructions too

Committer note:

To test intel_pt one needs to make sure VT-x isn't active, i.e.
stopping KVM guests on the test machine, as described by Andi Kleen at http://lkml.kernel.org/r/20160301234953.GD23621@tassilo.jf.intel.com Signed-off-by: Andi Kleen Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Jiri Olsa Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1459187142-20035-1-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/intel-pt.txt | 7 +++++++ tools/perf/Documentation/itrace.txt | 8 ++++++++ tools/perf/util/auxtrace.c | 7 +++++++ tools/perf/util/auxtrace.h | 2 ++ tools/perf/util/intel-bts.c | 5 +++++ tools/perf/util/intel-pt.c | 22 ++++++++++++++++++++-- 6 files changed, 49 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt index be764f9ec7691..c6c8318e38a2e 100644 --- a/tools/perf/Documentation/intel-pt.txt +++ b/tools/perf/Documentation/intel-pt.txt @@ -672,6 +672,7 @@ The letters are: d create a debug log g synthesize a call chain (use with i or x) l synthesize last branch entries (use with i or x) + s skip initial number of events "Instructions" events look like they were recorded by "perf record -e instructions". @@ -730,6 +731,12 @@ from one sample to the next. To disable trace decoding entirely, use the option --no-itrace. +It is also possible to skip events generated (instructions, branches, transactions) +at the beginning. This is useful to ignore initialization code. + + --itrace=i0nss1000000 + +skips the first million instructions. dump option ----------- diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt index 65453f4c70060..e2a4c5e0dbe5b 100644 --- a/tools/perf/Documentation/itrace.txt +++ b/tools/perf/Documentation/itrace.txt @@ -7,6 +7,7 @@ d create a debug log g synthesize a call chain (use with i or x) l synthesize last branch entries (use with i or x) + s skip initial number of events The default is all events i.e. the same as --itrace=ibxe @@ -24,3 +25,10 @@ Also the number of last branch entries (default 64, max. 1024) for instructions or transactions events can be specified. + + It is also possible to skip events generated (instructions, branches, transactions) + at the beginning. This is useful to ignore initialization code. + + --itrace=i0nss1000000 + + skips the first million instructions. diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index ec164fe70718d..c9169011e55ef 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -940,6 +940,7 @@ void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts) synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD; synth_opts->callchain_sz = PERF_ITRACE_DEFAULT_CALLCHAIN_SZ; synth_opts->last_branch_sz = PERF_ITRACE_DEFAULT_LAST_BRANCH_SZ; + synth_opts->initial_skip = 0; } /* @@ -1064,6 +1065,12 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str, synth_opts->last_branch_sz = val; } break; + case 's': + synth_opts->initial_skip = strtoul(p, &endptr, 10); + if (p == endptr) + goto out_err; + p = endptr; + break; case ' ': case ',': break; diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 57ff31ecb8e40..767989e0e3126 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -68,6 +68,7 @@ enum itrace_period_type { * @last_branch_sz: branch context size * @period: 'instructions' events period * @period_type: 'instructions' events period type + * @initial_skip: skip N events at the beginning. 
*/ struct itrace_synth_opts { bool set; @@ -86,6 +87,7 @@ struct itrace_synth_opts { unsigned int last_branch_sz; unsigned long long period; enum itrace_period_type period_type; + unsigned long initial_skip; }; /** diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index abf1366e2a24d..9df9960855633 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -66,6 +66,7 @@ struct intel_bts { u64 branches_id; size_t branches_event_size; bool synth_needs_swap; + unsigned long num_events; }; struct intel_bts_queue { @@ -275,6 +276,10 @@ static int intel_bts_synth_branch_sample(struct intel_bts_queue *btsq, union perf_event event; struct perf_sample sample = { .ip = 0, }; + if (bts->synth_opts.initial_skip && + bts->num_events++ <= bts->synth_opts.initial_skip) + return 0; + event.sample.header.type = PERF_RECORD_SAMPLE; event.sample.header.misc = PERF_RECORD_MISC_USER; event.sample.header.size = sizeof(struct perf_event_header); diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 407f11b97c8dc..ddec87f6e6165 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -100,6 +100,8 @@ struct intel_pt { u64 cyc_bit; u64 noretcomp_bit; unsigned max_non_turbo_ratio; + + unsigned long num_events; }; enum switch_state { @@ -972,6 +974,10 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq) if (pt->branches_filter && !(pt->branches_filter & ptq->flags)) return 0; + if (pt->synth_opts.initial_skip && + pt->num_events++ < pt->synth_opts.initial_skip) + return 0; + event->sample.header.type = PERF_RECORD_SAMPLE; event->sample.header.misc = PERF_RECORD_MISC_USER; event->sample.header.size = sizeof(struct perf_event_header); @@ -1029,6 +1035,10 @@ static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq) union perf_event *event = ptq->event_buf; struct perf_sample sample = { .ip = 0, }; + if (pt->synth_opts.initial_skip && + pt->num_events++ < pt->synth_opts.initial_skip) + return 0; + event->sample.header.type = PERF_RECORD_SAMPLE; event->sample.header.misc = PERF_RECORD_MISC_USER; event->sample.header.size = sizeof(struct perf_event_header); @@ -1087,6 +1097,10 @@ static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq) union perf_event *event = ptq->event_buf; struct perf_sample sample = { .ip = 0, }; + if (pt->synth_opts.initial_skip && + pt->num_events++ < pt->synth_opts.initial_skip) + return 0; + event->sample.header.type = PERF_RECORD_SAMPLE; event->sample.header.misc = PERF_RECORD_MISC_USER; event->sample.header.size = sizeof(struct perf_event_header); @@ -1199,14 +1213,18 @@ static int intel_pt_sample(struct intel_pt_queue *ptq) ptq->have_sample = false; if (pt->sample_instructions && - (state->type & INTEL_PT_INSTRUCTION)) { + (state->type & INTEL_PT_INSTRUCTION) && + (!pt->synth_opts.initial_skip || + pt->num_events++ >= pt->synth_opts.initial_skip)) { err = intel_pt_synth_instruction_sample(ptq); if (err) return err; } if (pt->sample_transactions && - (state->type & INTEL_PT_TRANSACTION)) { + (state->type & INTEL_PT_TRANSACTION) && + (!pt->synth_opts.initial_skip || + pt->num_events++ >= pt->synth_opts.initial_skip)) { err = intel_pt_synth_transaction_sample(ptq); if (err) return err; -- GitLab From 4f7d6dd4df8b388e2056c89b528254cdd79dea2a Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 29 Mar 2016 12:28:33 -0700 Subject: [PATCH 022/705] regmap: Fix implicit inclusion of device.h internal.h is using dev_name() but doesn't include device.h which defines it. 
Add an explicit include to avoid build problems due to this.

Tested-by: Alexander Stein
Signed-off-by: Mark Brown
---
 drivers/base/regmap/internal.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h
index 5c79526245c2e..a0380338946a1 100644
--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -13,6 +13,7 @@
 #ifndef _REGMAP_INTERNAL_H
 #define _REGMAP_INTERNAL_H
 
+#include <linux/device.h>
 #include
 #include
 #include
--
GitLab


From 0dbdb76c0ca8e7caf27c9a210f64c4359e2974a4 Mon Sep 17 00:00:00 2001
From: Mark Brown
Date: Tue, 29 Mar 2016 12:30:44 -0700
Subject: [PATCH 023/705] regmap: mmio: Parse endianness definitions from DT

Since we changed to do formatting in the bus we now skip all the format
parsing that the core does for its data marshalling code. This means
that we skip the DT parsing it does, which breaks some systems, so we
need to add an explicit call in the MMIO code to do this.

Reported-by: Alexander Stein
Tested-by: Alexander Stein
Signed-off-by: Mark Brown
---
 drivers/base/regmap/regmap-mmio.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/base/regmap/regmap-mmio.c b/drivers/base/regmap/regmap-mmio.c
index b27573c69af74..7132a662c80d9 100644
--- a/drivers/base/regmap/regmap-mmio.c
+++ b/drivers/base/regmap/regmap-mmio.c
@@ -23,6 +23,8 @@
 #include
 #include
 
+#include "internal.h"
+
 struct regmap_mmio_context {
 	void __iomem *regs;
 	unsigned val_bytes;
@@ -245,7 +247,7 @@ static struct regmap_mmio_context *regmap_mmio_gen_context(struct device *dev,
 	ctx->val_bytes = config->val_bits / 8;
 	ctx->clk = ERR_PTR(-ENODEV);
 
-	switch (config->val_format_endian) {
+	switch (regmap_get_val_endian(dev, &regmap_mmio, config)) {
 	case REGMAP_ENDIAN_DEFAULT:
 	case REGMAP_ENDIAN_LITTLE:
 #ifdef __LITTLE_ENDIAN
--
GitLab


From e633c65a1d5859da170a83d537d9762c07d12213 Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Sun, 20 Mar 2016 01:33:36 -0700
Subject: [PATCH 024/705] x86/perf/intel/uncore: Make the Intel uncore PMU
 driver modular

By default, the uncore driver will be built into the kernel. If it is
configured as a module, the supported CPU model can be auto-loaded.

This patch also cleans up the code of uncore_cpu_init() and
uncore_pci_init().
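As a usage sketch (the module name follows the intel-uncore-objs rule in
the Makefile below; the event name is hardware-dependent and only
illustrative):

  $ grep PERF_EVENTS_INTEL_UNCORE .config
  CONFIG_PERF_EVENTS_INTEL_UNCORE=m

  $ modprobe intel-uncore	# or auto-loaded via the x86cpu device table
  $ perf stat -a -e uncore_imc/data_reads/ sleep 1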
Based-on-a-patch-by: Thomas Gleixner Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Link: http://lkml.kernel.org/r/1458462817-2475-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 6 +- arch/x86/Kconfig.perf | 11 ++ arch/x86/events/Makefile | 9 +- arch/x86/events/intel/Makefile | 6 + arch/x86/events/intel/uncore.c | 216 +++++++++++++++++++-------------- 5 files changed, 148 insertions(+), 100 deletions(-) create mode 100644 arch/x86/Kconfig.perf create mode 100644 arch/x86/events/intel/Makefile diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a313c0e7e1655..496218b8236b4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -160,10 +160,6 @@ config INSTRUCTION_DECODER def_bool y depends on KPROBES || PERF_EVENTS || UPROBES -config PERF_EVENTS_INTEL_UNCORE - def_bool y - depends on PERF_EVENTS && CPU_SUP_INTEL && PCI - config OUTPUT_FORMAT string default "elf32-i386" if X86_32 @@ -1042,6 +1038,8 @@ config X86_THERMAL_VECTOR def_bool y depends on X86_MCE_INTEL +source "arch/x86/Kconfig.perf" + config X86_LEGACY_VM86 bool "Legacy VM86 support" default n diff --git a/arch/x86/Kconfig.perf b/arch/x86/Kconfig.perf new file mode 100644 index 0000000000000..90b7f5878c96b --- /dev/null +++ b/arch/x86/Kconfig.perf @@ -0,0 +1,11 @@ +menu "Performance monitoring" + +config PERF_EVENTS_INTEL_UNCORE + tristate "Intel uncore performance events" + depends on PERF_EVENTS && CPU_SUP_INTEL && PCI + default y + ---help--- + Include support for Intel uncore performance events. These are + available on NehalemEX and more modern processors. + +endmenu diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index f59618a399905..1d392c39fe560 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -6,9 +6,6 @@ obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o msr.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif -obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/knc.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/p4.o intel/p6.o intel/pt.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/rapl.o msr.o -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o + +obj-$(CONFIG_CPU_SUP_INTEL) += msr.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/ diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile new file mode 100644 index 0000000000000..a6c744871a739 --- /dev/null +++ b/arch/x86/events/intel/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o cqm.o +obj-$(CONFIG_CPU_SUP_INTEL) += cstate.o ds.o knc.o +obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o +obj-$(CONFIG_CPU_SUP_INTEL) += rapl.o +obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o +intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 7012d18bb2930..17734a6ef474c 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1,3 +1,4 @@ +#include #include "uncore.h" static struct intel_uncore_type *empty_uncore[] = { NULL, }; @@ -21,6 +22,8 @@ static struct event_constraint uncore_constraint_fixed = struct event_constraint uncore_constraint_empty = 
EVENT_CONSTRAINT(0, 0, 0); +MODULE_LICENSE("GPL"); + static int uncore_pcibus_to_physid(struct pci_bus *bus) { struct pci2phy_map *map; @@ -754,7 +757,7 @@ static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu) pmu->registered = false; } -static void __init __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) +static void __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) { struct intel_uncore_pmu *pmu = type->pmus; struct intel_uncore_box *box; @@ -770,7 +773,7 @@ static void __init __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) } } -static void __init uncore_exit_boxes(void *dummy) +static void uncore_exit_boxes(void *dummy) { struct intel_uncore_type **types; @@ -787,7 +790,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu) kfree(pmu->boxes); } -static void __init uncore_type_exit(struct intel_uncore_type *type) +static void uncore_type_exit(struct intel_uncore_type *type) { struct intel_uncore_pmu *pmu = type->pmus; int i; @@ -804,7 +807,7 @@ static void __init uncore_type_exit(struct intel_uncore_type *type) type->events_group = NULL; } -static void __init uncore_types_exit(struct intel_uncore_type **types) +static void uncore_types_exit(struct intel_uncore_type **types) { for (; *types; types++) uncore_type_exit(*types); @@ -989,46 +992,6 @@ static int __init uncore_pci_init(void) size_t size; int ret; - switch (boot_cpu_data.x86_model) { - case 45: /* Sandy Bridge-EP */ - ret = snbep_uncore_pci_init(); - break; - case 62: /* Ivy Bridge-EP */ - ret = ivbep_uncore_pci_init(); - break; - case 63: /* Haswell-EP */ - ret = hswep_uncore_pci_init(); - break; - case 79: /* BDX-EP */ - case 86: /* BDX-DE */ - ret = bdx_uncore_pci_init(); - break; - case 42: /* Sandy Bridge */ - ret = snb_uncore_pci_init(); - break; - case 58: /* Ivy Bridge */ - ret = ivb_uncore_pci_init(); - break; - case 60: /* Haswell */ - case 69: /* Haswell Celeron */ - ret = hsw_uncore_pci_init(); - break; - case 61: /* Broadwell */ - ret = bdw_uncore_pci_init(); - break; - case 87: /* Knights Landing */ - ret = knl_uncore_pci_init(); - break; - case 94: /* SkyLake */ - ret = skl_uncore_pci_init(); - break; - default: - return -ENODEV; - } - - if (ret) - return ret; - size = max_packages * sizeof(struct pci_extra_dev); uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); if (!uncore_extra_pci_dev) { @@ -1060,7 +1023,7 @@ static int __init uncore_pci_init(void) return ret; } -static void __init uncore_pci_exit(void) +static void uncore_pci_exit(void) { if (pcidrv_registered) { pcidrv_registered = false; @@ -1287,46 +1250,6 @@ static int __init uncore_cpu_init(void) { int ret; - switch (boot_cpu_data.x86_model) { - case 26: /* Nehalem */ - case 30: - case 37: /* Westmere */ - case 44: - nhm_uncore_cpu_init(); - break; - case 42: /* Sandy Bridge */ - case 58: /* Ivy Bridge */ - case 60: /* Haswell */ - case 69: /* Haswell */ - case 70: /* Haswell */ - case 61: /* Broadwell */ - case 71: /* Broadwell */ - snb_uncore_cpu_init(); - break; - case 45: /* Sandy Bridge-EP */ - snbep_uncore_cpu_init(); - break; - case 46: /* Nehalem-EX */ - case 47: /* Westmere-EX aka. 
Xeon E7 */ - nhmex_uncore_cpu_init(); - break; - case 62: /* Ivy Bridge-EP */ - ivbep_uncore_cpu_init(); - break; - case 63: /* Haswell-EP */ - hswep_uncore_cpu_init(); - break; - case 79: /* BDX-EP */ - case 86: /* BDX-DE */ - bdx_uncore_cpu_init(); - break; - case 87: /* Knights Landing */ - knl_uncore_cpu_init(); - break; - default: - return -ENODEV; - } - ret = uncore_types_init(uncore_msr_uncores, true); if (ret) goto err; @@ -1376,11 +1299,105 @@ static int __init uncore_cpumask_init(bool msr) return 0; } +#define X86_UNCORE_MODEL_MATCH(model, init) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init } + +struct intel_uncore_init_fun { + void (*cpu_init)(void); + int (*pci_init)(void); +}; + +static const struct intel_uncore_init_fun nhm_uncore_init __initconst = { + .cpu_init = nhm_uncore_cpu_init, +}; + +static const struct intel_uncore_init_fun snb_uncore_init __initconst = { + .cpu_init = snb_uncore_cpu_init, + .pci_init = snb_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun ivb_uncore_init __initconst = { + .cpu_init = snb_uncore_cpu_init, + .pci_init = ivb_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun hsw_uncore_init __initconst = { + .cpu_init = snb_uncore_cpu_init, + .pci_init = hsw_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun bdw_uncore_init __initconst = { + .cpu_init = snb_uncore_cpu_init, + .pci_init = bdw_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun snbep_uncore_init __initconst = { + .cpu_init = snbep_uncore_cpu_init, + .pci_init = snbep_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun nhmex_uncore_init __initconst = { + .cpu_init = nhmex_uncore_cpu_init, +}; + +static const struct intel_uncore_init_fun ivbep_uncore_init __initconst = { + .cpu_init = ivbep_uncore_cpu_init, + .pci_init = ivbep_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun hswep_uncore_init __initconst = { + .cpu_init = hswep_uncore_cpu_init, + .pci_init = hswep_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun bdx_uncore_init __initconst = { + .cpu_init = bdx_uncore_cpu_init, + .pci_init = bdx_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun knl_uncore_init __initconst = { + .cpu_init = knl_uncore_cpu_init, + .pci_init = knl_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun skl_uncore_init __initconst = { + .pci_init = skl_uncore_pci_init, +}; + +static const struct x86_cpu_id intel_uncore_match[] __initconst = { + X86_UNCORE_MODEL_MATCH(26, nhm_uncore_init), /* Nehalem */ + X86_UNCORE_MODEL_MATCH(30, nhm_uncore_init), + X86_UNCORE_MODEL_MATCH(37, nhm_uncore_init), /* Westmere */ + X86_UNCORE_MODEL_MATCH(44, nhm_uncore_init), + X86_UNCORE_MODEL_MATCH(42, snb_uncore_init), /* Sandy Bridge */ + X86_UNCORE_MODEL_MATCH(58, ivb_uncore_init), /* Ivy Bridge */ + X86_UNCORE_MODEL_MATCH(60, hsw_uncore_init), /* Haswell */ + X86_UNCORE_MODEL_MATCH(69, hsw_uncore_init), /* Haswell Celeron */ + X86_UNCORE_MODEL_MATCH(70, hsw_uncore_init), /* Haswell */ + X86_UNCORE_MODEL_MATCH(61, bdw_uncore_init), /* Broadwell */ + X86_UNCORE_MODEL_MATCH(71, bdw_uncore_init), /* Broadwell */ + X86_UNCORE_MODEL_MATCH(45, snbep_uncore_init), /* Sandy Bridge-EP */ + X86_UNCORE_MODEL_MATCH(46, nhmex_uncore_init), /* Nehalem-EX */ + X86_UNCORE_MODEL_MATCH(47, nhmex_uncore_init), /* Westmere-EX aka. 
Xeon E7 */ + X86_UNCORE_MODEL_MATCH(62, ivbep_uncore_init), /* Ivy Bridge-EP */ + X86_UNCORE_MODEL_MATCH(63, hswep_uncore_init), /* Haswell-EP */ + X86_UNCORE_MODEL_MATCH(79, bdx_uncore_init), /* BDX-EP */ + X86_UNCORE_MODEL_MATCH(86, bdx_uncore_init), /* BDX-DE */ + X86_UNCORE_MODEL_MATCH(87, knl_uncore_init), /* Knights Landing */ + X86_UNCORE_MODEL_MATCH(94, skl_uncore_init), /* SkyLake */ + {}, +}; + +MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match); + static int __init intel_uncore_init(void) { - int pret, cret, ret; + const struct x86_cpu_id *id; + struct intel_uncore_init_fun *uncore_init; + int pret = 0, cret = 0, ret; - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + id = x86_match_cpu(intel_uncore_match); + if (!id) return -ENODEV; if (cpu_has_hypervisor) @@ -1388,8 +1405,17 @@ static int __init intel_uncore_init(void) max_packages = topology_max_packages(); - pret = uncore_pci_init(); - cret = uncore_cpu_init(); + uncore_init = (struct intel_uncore_init_fun *)id->driver_data; + if (uncore_init->pci_init) { + pret = uncore_init->pci_init(); + if (!pret) + pret = uncore_pci_init(); + } + + if (uncore_init->cpu_init) { + uncore_init->cpu_init(); + cret = uncore_cpu_init(); + } if (cret && pret) return -ENODEV; @@ -1409,4 +1435,14 @@ static int __init intel_uncore_init(void) cpu_notifier_register_done(); return ret; } -device_initcall(intel_uncore_init); +module_init(intel_uncore_init); + +static void __exit intel_uncore_exit(void) +{ + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&uncore_cpu_nb); + uncore_types_exit(uncore_msr_uncores); + uncore_pci_exit(); + cpu_notifier_register_done(); +} +module_exit(intel_uncore_exit); -- GitLab From 4b6e2571bf00019e016255ad62b56feb9f498db7 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Sat, 19 Mar 2016 00:20:50 -0700 Subject: [PATCH 025/705] x86/perf/intel/rapl: Make the Intel RAPL PMU driver modular By default, the RAPL driver will be built into the kernel. If it is configured as a module, the supported CPU model can be auto loaded. Also clean up the code of rapl_pmu_init(). Based-on-a-patch-by: Thomas Gleixner Signed-off-by: Kan Liang Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Link: http://lkml.kernel.org/r/1458372050-2420-2-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.perf | 8 +++ arch/x86/events/intel/Makefile | 3 +- arch/x86/events/intel/rapl.c | 121 ++++++++++++++++++++++----------- 3 files changed, 92 insertions(+), 40 deletions(-) diff --git a/arch/x86/Kconfig.perf b/arch/x86/Kconfig.perf index 90b7f5878c96b..b239ad5d0a4e9 100644 --- a/arch/x86/Kconfig.perf +++ b/arch/x86/Kconfig.perf @@ -8,4 +8,12 @@ config PERF_EVENTS_INTEL_UNCORE Include support for Intel uncore performance events. These are available on NehalemEX and more modern processors. +config PERF_EVENTS_INTEL_RAPL + tristate "Intel rapl performance events" + depends on PERF_EVENTS && CPU_SUP_INTEL && PCI + default y + ---help--- + Include support for Intel rapl performance events for power + monitoring on modern processors. 
+ endmenu diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index a6c744871a739..27adbbab99104 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += cstate.o ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o -obj-$(CONFIG_CPU_SUP_INTEL) += rapl.o +obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl.o +intel-rapl-objs := rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 70c93f9b03acc..e657de1923c25 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -53,6 +53,8 @@ #include #include "../perf_event.h" +MODULE_LICENSE("GPL"); + /* * RAPL energy status counters */ @@ -592,6 +594,11 @@ static int rapl_cpu_notifier(struct notifier_block *self, return NOTIFY_OK; } +static struct notifier_block rapl_cpu_nb = { + .notifier_call = rapl_cpu_notifier, + .priority = CPU_PRI_PERF + 1, +}; + static int rapl_check_hw_unit(bool apply_quirk) { u64 msr_rapl_power_unit_bits; @@ -660,7 +667,7 @@ static int __init rapl_prepare_cpus(void) return 0; } -static void __init cleanup_rapl_pmus(void) +static void cleanup_rapl_pmus(void) { int i; @@ -691,51 +698,77 @@ static int __init init_rapl_pmus(void) return 0; } +#define X86_RAPL_MODEL_MATCH(model, init) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init } + +struct intel_rapl_init_fun { + bool apply_quirk; + int cntr_mask; + struct attribute **attrs; +}; + +static const struct intel_rapl_init_fun snb_rapl_init __initconst = { + .apply_quirk = false, + .cntr_mask = RAPL_IDX_CLN, + .attrs = rapl_events_cln_attr, +}; + +static const struct intel_rapl_init_fun hsx_rapl_init __initconst = { + .apply_quirk = true, + .cntr_mask = RAPL_IDX_SRV, + .attrs = rapl_events_srv_attr, +}; + +static const struct intel_rapl_init_fun hsw_rapl_init __initconst = { + .apply_quirk = false, + .cntr_mask = RAPL_IDX_HSW, + .attrs = rapl_events_hsw_attr, +}; + +static const struct intel_rapl_init_fun snbep_rapl_init __initconst = { + .apply_quirk = false, + .cntr_mask = RAPL_IDX_SRV, + .attrs = rapl_events_srv_attr, +}; + +static const struct intel_rapl_init_fun knl_rapl_init __initconst = { + .apply_quirk = true, + .cntr_mask = RAPL_IDX_KNL, + .attrs = rapl_events_knl_attr, +}; + static const struct x86_cpu_id rapl_cpu_match[] __initconst = { - [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, - [1] = {}, + X86_RAPL_MODEL_MATCH(42, snb_rapl_init), /* Sandy Bridge */ + X86_RAPL_MODEL_MATCH(58, snb_rapl_init), /* Ivy Bridge */ + X86_RAPL_MODEL_MATCH(63, hsx_rapl_init), /* Haswell-Server */ + X86_RAPL_MODEL_MATCH(79, hsx_rapl_init), /* Broadwell-Server */ + X86_RAPL_MODEL_MATCH(60, hsw_rapl_init), /* Haswell */ + X86_RAPL_MODEL_MATCH(69, hsw_rapl_init), /* Haswell-Celeron */ + X86_RAPL_MODEL_MATCH(61, hsw_rapl_init), /* Broadwell */ + X86_RAPL_MODEL_MATCH(71, hsw_rapl_init), /* Broadwell-H */ + X86_RAPL_MODEL_MATCH(45, snbep_rapl_init), /* Sandy Bridge-EP */ + X86_RAPL_MODEL_MATCH(62, snbep_rapl_init), /* IvyTown */ + X86_RAPL_MODEL_MATCH(87, knl_rapl_init), /* Knights Landing */ + {}, }; +MODULE_DEVICE_TABLE(x86cpu, rapl_cpu_match); + static int __init rapl_pmu_init(void) { - bool apply_quirk = false; + const struct x86_cpu_id *id; + struct intel_rapl_init_fun *rapl_init; + bool apply_quirk; int ret; - if 
(!x86_match_cpu(rapl_cpu_match))
+	id = x86_match_cpu(rapl_cpu_match);
+	if (!id)
 		return -ENODEV;
 
-	switch (boot_cpu_data.x86_model) {
-	case 42: /* Sandy Bridge */
-	case 58: /* Ivy Bridge */
-		rapl_cntr_mask = RAPL_IDX_CLN;
-		rapl_pmu_events_group.attrs = rapl_events_cln_attr;
-		break;
-	case 63: /* Haswell-Server */
-	case 79: /* Broadwell-Server */
-		apply_quirk = true;
-		rapl_cntr_mask = RAPL_IDX_SRV;
-		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
-		break;
-	case 60: /* Haswell */
-	case 69: /* Haswell-Celeron */
-	case 61: /* Broadwell */
-	case 71: /* Broadwell-H */
-		rapl_cntr_mask = RAPL_IDX_HSW;
-		rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
-		break;
-	case 45: /* Sandy Bridge-EP */
-	case 62: /* IvyTown */
-		rapl_cntr_mask = RAPL_IDX_SRV;
-		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
-		break;
-	case 87: /* Knights Landing */
-		apply_quirk = true;
-		rapl_cntr_mask = RAPL_IDX_KNL;
-		rapl_pmu_events_group.attrs = rapl_events_knl_attr;
-		break;
-	default:
-		return -ENODEV;
-	}
+	rapl_init = (struct intel_rapl_init_fun *)id->driver_data;
+	apply_quirk = rapl_init->apply_quirk;
+	rapl_cntr_mask = rapl_init->cntr_mask;
+	rapl_pmu_events_group.attrs = rapl_init->attrs;
 
 	ret = rapl_check_hw_unit(apply_quirk);
 	if (ret)
@@ -755,7 +788,7 @@
 	if (ret)
 		goto out;
 
-	__perf_cpu_notifier(rapl_cpu_notifier);
+	__register_cpu_notifier(&rapl_cpu_nb);
 	cpu_notifier_register_done();
 	rapl_advertise();
 	return 0;
@@ -766,4 +799,14 @@
 	cpu_notifier_register_done();
 	return ret;
 }
-device_initcall(rapl_pmu_init);
+module_init(rapl_pmu_init);
+
+static void __exit intel_rapl_exit(void)
+{
+	cpu_notifier_register_begin();
+	__unregister_cpu_notifier(&rapl_cpu_nb);
+	perf_pmu_unregister(&rapl_pmus->pmu);
+	cleanup_rapl_pmus();
+	cpu_notifier_register_done();
+}
+module_exit(intel_rapl_exit);
--
GitLab


From 49de0493e5f67a8023fa6fa5c89097c1f77de74e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 20 Mar 2016 18:59:02 +0000
Subject: [PATCH 026/705] x86/perf/intel/cstate: Make cstate hotplug handling
 actually work

The current implementation, aside from being an incomprehensible mess, is
broken.

  # cat /sys/bus/event_source/devices/cstate_core/cpumask
  0-17

That's on a quad socket machine with 72 physical cores! Qualitee stuff.

So it's not a surprise that event migration in case of CPU hotplug does
not work either.

  # perf stat -e cstate_core/c6-residency/ -C 1 sleep 60 &
  # echo 0 >/sys/devices/system/cpu/cpu1/online

Tracing cstate_pmu_event_update gives me:

  [001] cstate_pmu_event_update <-event_sched_out

After the fix it properly moves the event:

  [001] cstate_pmu_event_update <-event_sched_out
  [073] cstate_pmu_event_update <-__perf_event_read
  [073] cstate_pmu_event_update <-event_sched_out

The migration of pkg events does not work either. Not that I'm
surprised.

I really could not be bothered to decode that loop mess and simply
replaced it by querying the proper cpumasks which give us the answer in
a comprehensible way.

This also requires directing the event to the current active reader CPU
in cstate_pmu_event_init(), otherwise the hotplug logic can't work.
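A quick way to re-check the fix, mirroring the reproducer above (CPU and
mask values are machine-specific):

  # cat /sys/bus/event_source/devices/cstate_core/cpumask	# expect one CPU per core
  # perf stat -e cstate_core/c6-residency/ -C 1 sleep 60 &
  # echo 0 >/sys/devices/system/cpu/cpu1/online	# the event should migrate to a sibling thread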
Signed-off-by: Thomas Gleixner [ Added event->cpu < 0 test to not explode] Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Link: http://lkml.kernel.org/r/20160320185623.422519970@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cstate.c | 122 ++++++++++++++------------------- 1 file changed, 53 insertions(+), 69 deletions(-) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 7946c4231169f..5c2f55fe142ad 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -385,7 +385,7 @@ static ssize_t cstate_get_attr_cpumask(struct device *dev, static int cstate_pmu_event_init(struct perf_event *event) { u64 cfg = event->attr.config; - int ret = 0; + int cpu; if (event->attr.type != event->pmu->type) return -ENOENT; @@ -400,26 +400,36 @@ static int cstate_pmu_event_init(struct perf_event *event) event->attr.sample_period) /* no sampling */ return -EINVAL; + if (event->cpu < 0) + return -EINVAL; + if (event->pmu == &cstate_core_pmu) { if (cfg >= PERF_CSTATE_CORE_EVENT_MAX) return -EINVAL; if (!core_msr[cfg].attr) return -EINVAL; event->hw.event_base = core_msr[cfg].msr; + cpu = cpumask_any_and(&cstate_core_cpu_mask, + topology_sibling_cpumask(event->cpu)); } else if (event->pmu == &cstate_pkg_pmu) { if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) return -EINVAL; if (!pkg_msr[cfg].attr) return -EINVAL; event->hw.event_base = pkg_msr[cfg].msr; - } else + cpu = cpumask_any_and(&cstate_pkg_cpu_mask, + topology_core_cpumask(event->cpu)); + } else { return -ENOENT; + } - /* must be done before validate_group */ + if (cpu >= nr_cpu_ids) + return -ENODEV; + + event->cpu = cpu; event->hw.config = cfg; event->hw.idx = -1; - - return ret; + return 0; } static inline u64 cstate_pmu_read_counter(struct perf_event *event) @@ -469,102 +479,76 @@ static int cstate_pmu_event_add(struct perf_event *event, int mode) return 0; } +/* + * Check if exiting cpu is the designated reader. 
If so migrate the + * events when there is a valid target available + */ static void cstate_cpu_exit(int cpu) { - int i, id, target; + unsigned int target; - /* cpu exit for cstate core */ - if (has_cstate_core) { - id = topology_core_id(cpu); - target = -1; - - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (id == topology_core_id(i)) { - target = i; - break; - } - } - if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0) + if (has_cstate_core && + cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask)) { + + target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu); + /* Migrate events if there is a valid target */ + if (target < nr_cpu_ids) { cpumask_set_cpu(target, &cstate_core_cpu_mask); - WARN_ON(cpumask_empty(&cstate_core_cpu_mask)); - if (target >= 0) perf_pmu_migrate_context(&cstate_core_pmu, cpu, target); + } } - /* cpu exit for cstate pkg */ - if (has_cstate_pkg) { - id = topology_physical_package_id(cpu); - target = -1; - - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (id == topology_physical_package_id(i)) { - target = i; - break; - } - } - if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0) + if (has_cstate_pkg && + cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask)) { + + target = cpumask_any_but(topology_core_cpumask(cpu), cpu); + /* Migrate events if there is a valid target */ + if (target < nr_cpu_ids) { cpumask_set_cpu(target, &cstate_pkg_cpu_mask); - WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask)); - if (target >= 0) perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); + } } } static void cstate_cpu_init(int cpu) { - int i, id; + unsigned int target; - /* cpu init for cstate core */ - if (has_cstate_core) { - id = topology_core_id(cpu); - for_each_cpu(i, &cstate_core_cpu_mask) { - if (id == topology_core_id(i)) - break; - } - if (i >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cstate_core_cpu_mask); - } + /* + * If this is the first online thread of that core, set it in + * the core cpu mask as the designated reader. + */ + target = cpumask_any_and(&cstate_core_cpu_mask, + topology_sibling_cpumask(cpu)); - /* cpu init for cstate pkg */ - if (has_cstate_pkg) { - id = topology_physical_package_id(cpu); - for_each_cpu(i, &cstate_pkg_cpu_mask) { - if (id == topology_physical_package_id(i)) - break; - } - if (i >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); - } + if (has_cstate_core && target >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_core_cpu_mask); + + /* + * If this is the first online thread of that package, set it + * in the package cpu mask as the designated reader. 
+ */ + target = cpumask_any_and(&cstate_pkg_cpu_mask, + topology_core_cpumask(cpu)); + if (has_cstate_pkg && target >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); } static int cstate_cpu_notifier(struct notifier_block *self, - unsigned long action, void *hcpu) + unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - break; case CPU_STARTING: cstate_cpu_init(cpu); break; - case CPU_UP_CANCELED: - case CPU_DYING: - break; - case CPU_ONLINE: - case CPU_DEAD: - break; case CPU_DOWN_PREPARE: cstate_cpu_exit(cpu); break; default: break; } - return NOTIFY_OK; } -- GitLab From 424646eeadab64da959f960928804e5289417819 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 20 Mar 2016 18:59:03 +0000 Subject: [PATCH 027/705] x86/perf/intel/cstate: Sanitize probing The whole probing functionality can simply be expressed with model matching and a bunch of structures describing the variants. This is a first step to make that driver modular. While at it, get rid of completely pointless comments and name the enums so they are self explaining. Signed-off-by: Thomas Gleixner [ Reworked probing to clear msr[].attr for all !present msrs. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Link: http://lkml.kernel.org/r/20160320185623.500381872@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cstate.c | 359 +++++++++++++++------------------ 1 file changed, 160 insertions(+), 199 deletions(-) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 5c2f55fe142ad..1aac40f1e4fe8 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -106,22 +106,27 @@ static ssize_t cstate_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf); +/* Model -> events mapping */ +struct cstate_model { + unsigned long core_events; + unsigned long pkg_events; + unsigned long quirks; +}; + +/* Quirk flags */ +#define SLM_PKG_C6_USE_C7_MSR (1UL << 0) + struct perf_cstate_msr { u64 msr; struct perf_pmu_events_attr *attr; - bool (*test)(int idx); }; /* cstate_core PMU */ - static struct pmu cstate_core_pmu; static bool has_cstate_core; -enum perf_cstate_core_id { - /* - * cstate_core events - */ +enum perf_cstate_core_events { PERF_CSTATE_CORE_C1_RES = 0, PERF_CSTATE_CORE_C3_RES, PERF_CSTATE_CORE_C6_RES, @@ -130,69 +135,16 @@ enum perf_cstate_core_id { PERF_CSTATE_CORE_EVENT_MAX, }; -bool test_core(int idx) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 != 6) - return false; - - switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 69: /* 22nm Haswell ULT */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 
79: /* 14nm Broadwell Server */ - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES || - idx == PERF_CSTATE_CORE_C7_RES) - return true; - break; - case 55: /* 22nm Atom "Silvermont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - case 76: /* 14nm Atom "Airmont" */ - if (idx == PERF_CSTATE_CORE_C1_RES || - idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - } - - return false; -} - PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00"); PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01"); PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02"); PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03"); static struct perf_cstate_msr core_msr[] = { - [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1, test_core, }, - [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3, test_core, }, - [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6, test_core, }, - [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7, test_core, }, + [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1 }, + [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3 }, + [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6 }, + [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7 }, }; static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = { @@ -234,18 +186,11 @@ static const struct attribute_group *core_attr_groups[] = { NULL, }; -/* cstate_core PMU end */ - - /* cstate_pkg PMU */ - static struct pmu cstate_pkg_pmu; static bool has_cstate_pkg; -enum perf_cstate_pkg_id { - /* - * cstate_pkg events - */ +enum perf_cstate_pkg_events { PERF_CSTATE_PKG_C2_RES = 0, PERF_CSTATE_PKG_C3_RES, PERF_CSTATE_PKG_C6_RES, @@ -257,69 +202,6 @@ enum perf_cstate_pkg_id { PERF_CSTATE_PKG_EVENT_MAX, }; -bool test_pkg(int idx) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 != 6) - return false; - - switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES || - idx == PERF_CSTATE_CORE_C7_RES) - return true; - break; - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - if (idx == PERF_CSTATE_PKG_C2_RES || - idx == PERF_CSTATE_PKG_C3_RES || - idx == PERF_CSTATE_PKG_C6_RES || - idx == PERF_CSTATE_PKG_C7_RES) - return true; - break; - case 55: /* 22nm Atom "Silvermont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - case 76: /* 14nm Atom "Airmont" */ - if (idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - case 69: /* 22nm Haswell ULT */ - if (idx == PERF_CSTATE_PKG_C2_RES || - idx == 
PERF_CSTATE_PKG_C3_RES || - idx == PERF_CSTATE_PKG_C6_RES || - idx == PERF_CSTATE_PKG_C7_RES || - idx == PERF_CSTATE_PKG_C8_RES || - idx == PERF_CSTATE_PKG_C9_RES || - idx == PERF_CSTATE_PKG_C10_RES) - return true; - break; - } - - return false; -} - PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00"); PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01"); PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02"); @@ -329,13 +211,13 @@ PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05"); PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06"); static struct perf_cstate_msr pkg_msr[] = { - [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2, test_pkg, }, - [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3, test_pkg, }, - [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6, test_pkg, }, - [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7, test_pkg, }, - [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8, test_pkg, }, - [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9, test_pkg, }, - [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10, test_pkg, }, + [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2 }, + [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3 }, + [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6 }, + [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7 }, + [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8 }, + [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9 }, + [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10 }, }; static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = { @@ -366,8 +248,6 @@ static const struct attribute_group *pkg_attr_groups[] = { NULL, }; -/* cstate_pkg PMU end*/ - static ssize_t cstate_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) @@ -552,48 +432,151 @@ static int cstate_cpu_notifier(struct notifier_block *self, return NOTIFY_OK; } +static struct pmu cstate_core_pmu = { + .attr_groups = core_attr_groups, + .name = "cstate_core", + .task_ctx_nr = perf_invalid_context, + .event_init = cstate_pmu_event_init, + .add = cstate_pmu_event_add, + .del = cstate_pmu_event_del, + .start = cstate_pmu_event_start, + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static struct pmu cstate_pkg_pmu = { + .attr_groups = pkg_attr_groups, + .name = "cstate_pkg", + .task_ctx_nr = perf_invalid_context, + .event_init = cstate_pmu_event_init, + .add = cstate_pmu_event_add, + .del = cstate_pmu_event_del, + .start = cstate_pmu_event_start, + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static const struct cstate_model nhm_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C3_RES) | + BIT(PERF_CSTATE_CORE_C6_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C7_RES), +}; + +static const struct cstate_model snb_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C3_RES) | + BIT(PERF_CSTATE_CORE_C6_RES) | + BIT(PERF_CSTATE_CORE_C7_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C3_RES) | 
+ BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C7_RES), +}; + +static const struct cstate_model hswult_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C3_RES) | + BIT(PERF_CSTATE_CORE_C6_RES) | + BIT(PERF_CSTATE_CORE_C7_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C7_RES) | + BIT(PERF_CSTATE_PKG_C8_RES) | + BIT(PERF_CSTATE_PKG_C9_RES) | + BIT(PERF_CSTATE_PKG_C10_RES), +}; + +static const struct cstate_model slm_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | + BIT(PERF_CSTATE_CORE_C6_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C6_RES), + .quirks = SLM_PKG_C6_USE_C7_MSR, +}; + +#define X86_CSTATES_MODEL(model, states) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long) &(states) } + +static const struct x86_cpu_id intel_cstates_match[] __initconst = { + X86_CSTATES_MODEL(30, nhm_cstates), /* 45nm Nehalem */ + X86_CSTATES_MODEL(26, nhm_cstates), /* 45nm Nehalem-EP */ + X86_CSTATES_MODEL(46, nhm_cstates), /* 45nm Nehalem-EX */ + + X86_CSTATES_MODEL(37, nhm_cstates), /* 32nm Westmere */ + X86_CSTATES_MODEL(44, nhm_cstates), /* 32nm Westmere-EP */ + X86_CSTATES_MODEL(47, nhm_cstates), /* 32nm Westmere-EX */ + + X86_CSTATES_MODEL(42, snb_cstates), /* 32nm SandyBridge */ + X86_CSTATES_MODEL(45, snb_cstates), /* 32nm SandyBridge-E/EN/EP */ + + X86_CSTATES_MODEL(58, snb_cstates), /* 22nm IvyBridge */ + X86_CSTATES_MODEL(62, snb_cstates), /* 22nm IvyBridge-EP/EX */ + + X86_CSTATES_MODEL(60, snb_cstates), /* 22nm Haswell Core */ + X86_CSTATES_MODEL(63, snb_cstates), /* 22nm Haswell Server */ + X86_CSTATES_MODEL(70, snb_cstates), /* 22nm Haswell + GT3e */ + + X86_CSTATES_MODEL(69, hswult_cstates), /* 22nm Haswell ULT */ + + X86_CSTATES_MODEL(55, slm_cstates), /* 22nm Atom Silvermont */ + X86_CSTATES_MODEL(77, slm_cstates), /* 22nm Atom Avoton/Rangely */ + X86_CSTATES_MODEL(76, slm_cstates), /* 22nm Atom Airmont */ + + X86_CSTATES_MODEL(61, snb_cstates), /* 14nm Broadwell Core-M */ + X86_CSTATES_MODEL(86, snb_cstates), /* 14nm Broadwell Xeon D */ + X86_CSTATES_MODEL(71, snb_cstates), /* 14nm Broadwell + GT3e */ + X86_CSTATES_MODEL(79, snb_cstates), /* 14nm Broadwell Server */ + + X86_CSTATES_MODEL(78, snb_cstates), /* 14nm Skylake Mobile */ + X86_CSTATES_MODEL(94, snb_cstates), /* 14nm Skylake Desktop */ + { }, +}; +MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); + /* * Probe the cstate events and insert the available one into sysfs attrs - * Return false if there is no available events. + * Return false if there are no available events. */ -static bool cstate_probe_msr(struct perf_cstate_msr *msr, - struct attribute **events_attrs, - int max_event_nr) +static bool __init cstate_probe_msr(const unsigned long evmsk, int max, + struct perf_cstate_msr *msr, + struct attribute **attrs) { - int i, j = 0; + bool found = false; + unsigned int bit; u64 val; - /* Probe the cstate events. */ - for (i = 0; i < max_event_nr; i++) { - if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) - msr[i].attr = NULL; - } - - /* List remaining events in the sysfs attrs. */ - for (i = 0; i < max_event_nr; i++) { - if (msr[i].attr) - events_attrs[j++] = &msr[i].attr->attr.attr; + for (bit = 0; bit < max; bit++) { + if (test_bit(bit, &evmsk) && !rdmsrl_safe(msr[bit].msr, &val)) { + *attrs++ = &msr[bit].attr->attr.attr; + found = true; + } else { + msr[bit].attr = NULL; + } } - events_attrs[j] = NULL; + *attrs = NULL; - return (j > 0) ? 
true : false;
+	return found;
 }
 
-static int __init cstate_init(void)
+static int __init cstate_probe(const struct cstate_model *cm)
 {
 	/* SLM has different MSR for PKG C6 */
-	switch (boot_cpu_data.x86_model) {
-	case 55:
-	case 76:
-	case 77:
+	if (cm->quirks & SLM_PKG_C6_USE_C7_MSR)
 		pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY;
-	}
 
-	if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX))
-		has_cstate_core = true;
+	has_cstate_core = cstate_probe_msr(cm->core_events,
+					   PERF_CSTATE_CORE_EVENT_MAX,
+					   core_msr, core_events_attrs);
 
-	if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX))
-		has_cstate_pkg = true;
+	has_cstate_pkg = cstate_probe_msr(cm->pkg_events,
+					  PERF_CSTATE_PKG_EVENT_MAX,
+					  pkg_msr, pkg_events_attrs);
 
 	return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV;
 }
@@ -612,32 +595,6 @@ static void __init cstate_cpumask_init(void)
 	cpu_notifier_register_done();
 }
 
-static struct pmu cstate_core_pmu = {
-	.attr_groups	= core_attr_groups,
-	.name		= "cstate_core",
-	.task_ctx_nr	= perf_invalid_context,
-	.event_init	= cstate_pmu_event_init,
-	.add		= cstate_pmu_event_add, /* must have */
-	.del		= cstate_pmu_event_del, /* must have */
-	.start		= cstate_pmu_event_start,
-	.stop		= cstate_pmu_event_stop,
-	.read		= cstate_pmu_event_update,
-	.capabilities	= PERF_PMU_CAP_NO_INTERRUPT,
-};
-
-static struct pmu cstate_pkg_pmu = {
-	.attr_groups	= pkg_attr_groups,
-	.name		= "cstate_pkg",
-	.task_ctx_nr	= perf_invalid_context,
-	.event_init	= cstate_pmu_event_init,
-	.add		= cstate_pmu_event_add, /* must have */
-	.del		= cstate_pmu_event_del, /* must have */
-	.start		= cstate_pmu_event_start,
-	.stop		= cstate_pmu_event_stop,
-	.read		= cstate_pmu_event_update,
-	.capabilities	= PERF_PMU_CAP_NO_INTERRUPT,
-};
-
 static void __init cstate_pmus_register(void)
 {
 	int err;
@@ -659,12 +616,17 @@ static void __init cstate_pmus_register(void)
 
 static int __init cstate_pmu_init(void)
 {
+	const struct x86_cpu_id *id;
 	int err;
 
-	if (cpu_has_hypervisor)
+	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
 		return -ENODEV;
 
-	err = cstate_init();
+	id = x86_match_cpu(intel_cstates_match);
+	if (!id)
+		return -ENODEV;
+
+	err = cstate_probe((const struct cstate_model *) id->driver_data);
 	if (err)
 		return err;
@@ -674,5 +636,4 @@ static int __init cstate_pmu_init(void)
 
 	return 0;
 }
-
 device_initcall(cstate_pmu_init);
--
GitLab


From d29859e7777ebc2c8e2db6e4d8e299f50fc26414 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 20 Mar 2016 18:59:03 +0000
Subject: [PATCH 028/705] x86/perf/intel/cstate: Sanitize error handling

There is no point in WARN_ON() inside of a well-known init function. We
already know the call stack and it's really not of critical importance
whether the registration of a PMU fails.

Aside from that, for consistency reasons it's just pointless to try to
register another PMU if the first register attempt failed. There is also
no value in keeping one PMU if the second one cannot be registered.

Make it consistent so we can finally modularize the driver.
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Link: http://lkml.kernel.org/r/20160320185623.579794064@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cstate.c | 50 ++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 1aac40f1e4fe8..e90ec9e73ac71 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -581,37 +581,45 @@ static int __init cstate_probe(const struct cstate_model *cm) return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV; } -static void __init cstate_cpumask_init(void) +static void __init cstate_cleanup(void) { - int cpu; - - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) - cstate_cpu_init(cpu); + if (has_cstate_core) + perf_pmu_unregister(&cstate_core_pmu); - __perf_cpu_notifier(cstate_cpu_notifier); - - cpu_notifier_register_done(); + if (has_cstate_pkg) + perf_pmu_unregister(&cstate_pkg_pmu); } -static void __init cstate_pmus_register(void) +static int __init cstate_init(void) { - int err; + int cpu, err; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) + cstate_cpu_init(cpu); if (has_cstate_core) { err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); - if (WARN_ON(err)) - pr_info("Failed to register PMU %s error %d\n", - cstate_core_pmu.name, err); + if (err) { + has_cstate_core = false; + pr_info("Failed to register cstate core pmu\n"); + goto out; + } } if (has_cstate_pkg) { err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1); - if (WARN_ON(err)) - pr_info("Failed to register PMU %s error %d\n", - cstate_pkg_pmu.name, err); + if (err) { + has_cstate_pkg = false; + pr_info("Failed to register cstate pkg pmu\n"); + cstate_cleanup(); + goto out; + } } + __perf_cpu_notifier(cstate_cpu_notifier); +out: + cpu_notifier_register_done(); + return err; } static int __init cstate_pmu_init(void) @@ -630,10 +638,6 @@ static int __init cstate_pmu_init(void) if (err) return err; - cstate_cpumask_init(); - - cstate_pmus_register(); - - return 0; + return cstate_init(); } device_initcall(cstate_pmu_init); -- GitLab From c7afba320e91cca46fdf078798002b9ec84be8d3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 20 Mar 2016 18:59:04 +0000 Subject: [PATCH 029/705] x86/perf/intel/cstate: Modularize driver Add the exit function and allow the driver to be built as a module. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Link: http://lkml.kernel.org/r/20160320185623.658869675@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.perf | 8 ++++++++ arch/x86/events/intel/Makefile | 4 +++- arch/x86/events/intel/cstate.c | 22 +++++++++++++++++++--- 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/arch/x86/Kconfig.perf b/arch/x86/Kconfig.perf index b239ad5d0a4e9..7d29dd75d07b6 100644 --- a/arch/x86/Kconfig.perf +++ b/arch/x86/Kconfig.perf @@ -16,4 +16,12 @@ config PERF_EVENTS_INTEL_RAPL Include support for Intel rapl performance events for power monitoring on modern processors. 
+config PERF_EVENTS_INTEL_CSTATE + tristate "Intel cstate performance events" + depends on PERF_EVENTS && CPU_SUP_INTEL && PCI + default y + ---help--- + Include support for Intel cstate performance events for power + monitoring on modern processors. + endmenu diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index 27adbbab99104..3660b2cf245ad 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -1,7 +1,9 @@ obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o cqm.o -obj-$(CONFIG_CPU_SUP_INTEL) += cstate.o ds.o knc.o +obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl.o intel-rapl-objs := rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o +obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o +intel-cstate-objs := cstate.o diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index e90ec9e73ac71..9ba4e4136a153 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -91,6 +91,8 @@ #include #include "../perf_event.h" +MODULE_LICENSE("GPL"); + #define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ static ssize_t __cstate_##_var##_show(struct kobject *kobj, \ struct kobj_attribute *attr, \ @@ -432,6 +434,11 @@ static int cstate_cpu_notifier(struct notifier_block *self, return NOTIFY_OK; } +static struct notifier_block cstate_cpu_nb = { + .notifier_call = cstate_cpu_notifier, + .priority = CPU_PRI_PERF + 1, +}; + static struct pmu cstate_core_pmu = { .attr_groups = core_attr_groups, .name = "cstate_core", @@ -581,7 +588,7 @@ static int __init cstate_probe(const struct cstate_model *cm) return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV; } -static void __init cstate_cleanup(void) +static inline void cstate_cleanup(void) { if (has_cstate_core) perf_pmu_unregister(&cstate_core_pmu); @@ -616,7 +623,7 @@ static int __init cstate_init(void) goto out; } } - __perf_cpu_notifier(cstate_cpu_notifier); + __register_cpu_notifier(&cstate_cpu_nb); out: cpu_notifier_register_done(); return err; @@ -640,4 +647,13 @@ static int __init cstate_pmu_init(void) return cstate_init(); } -device_initcall(cstate_pmu_init); +module_init(cstate_pmu_init); + +static void __exit cstate_pmu_exit(void) +{ + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&cstate_cpu_nb); + cstate_cleanup(); + cpu_notifier_register_done(); +} +module_exit(cstate_pmu_exit); -- GitLab From 8a22426184774d7ced9c1d3aa4d95d34101fb3be Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Fri, 29 Jan 2016 16:29:56 +0800 Subject: [PATCH 030/705] perf/x86/msr: Add AMD PTSC (Performance Time-Stamp Counter) support AMD Carrizo (Family 15h, Model 60h) introduces a time-stamp counter which is indicated by CPUID.8000_0001H:ECX[27]. It increments at a 100 MHz rate in all P-states and C-states, as well as in S0 and S1. This counter will be used to calculate processor power and other values. So add an interface into the MSR PMU to get the PTSC counter value. 
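As an illustrative consumer (not part of this patch), the new counter can then be read from user space through the perf syscall. A minimal sketch, assuming the msr PMU's dynamic type is looked up from /sys/bus/event_source/devices/msr/type and using the event=0x05 encoding added below; the hard-coded type value is a placeholder:

	/* Hypothetical reader for the msr/ptsc/ event -- illustration only. */
	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <string.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		struct perf_event_attr attr;
		unsigned long long count;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = 10;		/* placeholder: read the real value from
					 * /sys/bus/event_source/devices/msr/type */
		attr.config = 0x05;	/* ptsc, per the event string below */

		fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0 || read(fd, &count, sizeof(count)) != sizeof(count))
			return 1;
		printf("ptsc: %llu\n", count);
		return 0;
	}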
Signed-off-by: Huang Rui Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Aravind Gopalakrishnan Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Borislav Petkov Cc: Fengguang Wu Cc: Jacob Shin Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1454056197-5893-2-git-send-email-ray.huang@amd.com Signed-off-by: Ingo Molnar --- arch/x86/events/msr.c | 8 ++++++++ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 1 + 3 files changed, 10 insertions(+) diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index ec863b9a9f780..6f6772f273aa9 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -6,6 +6,7 @@ enum perf_msr_id { PERF_MSR_MPERF = 2, PERF_MSR_PPERF = 3, PERF_MSR_SMI = 4, + PERF_MSR_PTSC = 5, PERF_MSR_EVENT_MAX, }; @@ -15,6 +16,11 @@ static bool test_aperfmperf(int idx) return boot_cpu_has(X86_FEATURE_APERFMPERF); } +static bool test_ptsc(int idx) +{ + return boot_cpu_has(X86_FEATURE_PTSC); +} + static bool test_intel(int idx) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || @@ -74,6 +80,7 @@ PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); +PMU_EVENT_ATTR_STRING(ptsc, evattr_ptsc, "event=0x05"); static struct perf_msr msr[] = { [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, @@ -81,6 +88,7 @@ static struct perf_msr msr[] = { [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, + [PERF_MSR_PTSC] = { MSR_F15H_PTSC, &evattr_ptsc, test_ptsc, }, }; static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 44ebd04878ebd..bdf9042f0295d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -177,6 +177,7 @@ #define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ #define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ +#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ #define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ #define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 984ab75bf6218..6e6a5ccfb3f54 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -326,6 +326,7 @@ #define MSR_F15H_PERF_CTR 0xc0010201 #define MSR_F15H_NB_PERF_CTL 0xc0010240 #define MSR_F15H_NB_PERF_CTR 0xc0010241 +#define MSR_F15H_PTSC 0xc0010280 #define MSR_F15H_IC_CFG 0xc0011021 /* Fam 10h MSRs */ -- GitLab From aaf248848db503927644d28e239bc399ed45959f Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Fri, 29 Jan 2016 16:29:57 +0800 Subject: [PATCH 031/705] perf/x86/msr: Add AMD IRPERF (Instructions Retired) performance counter AMD Zeppelin (Family 17h, Model 00h) introduces an instructions retired performance counter which is indicated by CPUID.8000_0008H:EBX[1]. 
A dedicated Instructions Retired MSR (0xC000_00E9) increments once for every instruction retired. Signed-off-by: Huang Rui Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Aravind Gopalakrishnan Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Borislav Petkov Cc: Fengguang Wu Cc: Jacob Shin Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1454056197-5893-3-git-send-email-ray.huang@amd.com Signed-off-by: Ingo Molnar --- arch/x86/events/msr.c | 30 +++++++++++++++++++----------- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 3 +++ 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 6f6772f273aa9..7111400a1f9a0 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -7,6 +7,7 @@ enum perf_msr_id { PERF_MSR_PPERF = 3, PERF_MSR_SMI = 4, PERF_MSR_PTSC = 5, + PERF_MSR_IRPERF = 6, PERF_MSR_EVENT_MAX, }; @@ -21,6 +22,11 @@ static bool test_ptsc(int idx) return boot_cpu_has(X86_FEATURE_PTSC); } +static bool test_irperf(int idx) +{ + return boot_cpu_has(X86_FEATURE_IRPERF); +} + static bool test_intel(int idx) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || @@ -75,20 +81,22 @@ struct perf_msr { bool (*test)(int idx); }; -PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); -PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); -PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); -PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); -PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); -PMU_EVENT_ATTR_STRING(ptsc, evattr_ptsc, "event=0x05"); +PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); +PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); +PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); +PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); +PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); +PMU_EVENT_ATTR_STRING(ptsc, evattr_ptsc, "event=0x05"); +PMU_EVENT_ATTR_STRING(irperf, evattr_irperf, "event=0x06"); static struct perf_msr msr[] = { - [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, - [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, - [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, - [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, - [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, + [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, + [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, + [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, + [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, + [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, [PERF_MSR_PTSC] = { MSR_F15H_PTSC, &evattr_ptsc, test_ptsc, }, + [PERF_MSR_IRPERF] = { MSR_F17H_IRPERF, &evattr_irperf, test_irperf, }, }; static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index bdf9042f0295d..dd448a91182e8 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -251,6 +251,7 @@ /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ #define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ +#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ /* Thermal and Power 
Management Leaf, CPUID level 0x00000006 (eax), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6e6a5ccfb3f54..e0e2f7dfbd363 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -313,6 +313,9 @@ #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ +/* Fam 17h MSRs */ +#define MSR_F17H_IRPERF 0xc00000e9 + /* Fam 16h MSRs */ #define MSR_F16H_L2I_PERF_CTL 0xc0010230 #define MSR_F16H_L2I_PERF_CTR 0xc0010231 -- GitLab From 07dc900e17a94681877b5797ce62ba97fa170400 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Mar 2016 14:30:35 +0200 Subject: [PATCH 032/705] perf/x86: Move Kconfig.perf and other perf configuration bits to events/Kconfig Ingo says: "If we do a separate file we should have it in arch/x86/events/Kconfig (not in arch/x86/Kconfig.perf), and also move some of the other bits, such as PERF_EVENTS_AMD_POWER?" Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 11 +---------- arch/x86/{Kconfig.perf => events/Kconfig} | 9 +++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) rename arch/x86/{Kconfig.perf => events/Kconfig} (68%) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 496218b8236b4..c2d34578a0a4b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1038,7 +1038,7 @@ config X86_THERMAL_VECTOR def_bool y depends on X86_MCE_INTEL -source "arch/x86/Kconfig.perf" +source "arch/x86/events/Kconfig" config X86_LEGACY_VM86 bool "Legacy VM86 support" @@ -1204,15 +1204,6 @@ config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE -config PERF_EVENTS_AMD_POWER - depends on PERF_EVENTS && CPU_SUP_AMD - tristate "AMD Processor Power Reporting Mechanism" - ---help--- - Provide power reporting mechanism support for AMD processors. - Currently, it leverages X86_FEATURE_ACC_POWER - (CPUID Fn8000_0007_EDX[12]) interface to calculate the - average power consumption on Family 15h processors. - config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" ---help--- diff --git a/arch/x86/Kconfig.perf b/arch/x86/events/Kconfig similarity index 68% rename from arch/x86/Kconfig.perf rename to arch/x86/events/Kconfig index 7d29dd75d07b6..98397db5ceaec 100644 --- a/arch/x86/Kconfig.perf +++ b/arch/x86/events/Kconfig @@ -24,4 +24,13 @@ config PERF_EVENTS_INTEL_CSTATE Include support for Intel cstate performance events for power monitoring on modern processors. +config PERF_EVENTS_AMD_POWER + depends on PERF_EVENTS && CPU_SUP_AMD + tristate "AMD Processor Power Reporting Mechanism" + ---help--- + Provide power reporting mechanism support for AMD processors. + Currently, it leverages X86_FEATURE_ACC_POWER + (CPUID Fn8000_0007_EDX[12]) interface to calculate the + average power consumption on Family 15h processors. + endmenu -- GitLab From 26657848502b78474a5f17f9ce2ae6dc8d8d6262 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Mar 2016 22:09:18 +0100 Subject: [PATCH 033/705] perf/core: Verify we have a single perf_hw_context PMU There should (and can) only be a single PMU for perf_hw_context events. 
This is because of how we schedule events: once a hardware event fails to schedule (the PMU is 'full') we stop trying to add more. The trivial 'fix' would break the Round-Robin scheduling we do. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index 52bedc5a5aaa1..525d11c59287b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7693,6 +7693,15 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) } skip_type: + if (pmu->task_ctx_nr == perf_hw_context) { + static int hw_context_taken = 0; + + if (WARN_ON_ONCE(hw_context_taken)) + pmu->task_ctx_nr = perf_invalid_context; + + hw_context_taken = 1; + } + pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); if (pmu->pmu_cpu_context) goto got_cpu_context; -- GitLab From dcb10a967ce82d5ad20570693091139ae716ff76 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 4 Mar 2016 15:42:45 +0200 Subject: [PATCH 034/705] perf/ring_buffer: Refuse to begin AUX transaction after rb->aux_mmap_count drops When a ring buffer's AUX area is unmapped and rb->aux_mmap_count drops to zero, new AUX transactions into this buffer can still be started, even though the buffer is en route to deallocation. This patch adds a check to perf_aux_output_begin() for rb->aux_mmap_count being zero, in which case there is no point starting new transactions. In other words, ring buffers that pass a certain point in perf_mmap_close() will not have their events sending new data, which clears the path for freeing those buffers' pages right there and then, provided that no active transactions are holding the AUX reference. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Link: http://lkml.kernel.org/r/1457098969-21595-2-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index c61f0cbd308b5..89abf623e93c6 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -287,6 +287,13 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount)) goto err; + /* + * If rb::aux_mmap_count is zero (and rb_has_aux() above went through), + * the aux buffer is in perf_mmap_close(), about to get freed. + */ + if (!atomic_read(&rb->aux_mmap_count)) + goto err; + /* * Nesting is not supported for AUX area, make sure nested * writers are caught early -- GitLab From 95ff4ca26c492fc1ed7751f5dd7ab7674b54f4e0 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 2 Dec 2015 18:41:11 +0200 Subject: [PATCH 035/705] perf/core: Free AUX pages in unmap path Now that we can ensure that new transactions won't start once a ring buffer's AUX area is on the way to getting unmapped, we only need to stop all events that can potentially be writing aux data to our ring buffer. 
Having done that, we can safely free the AUX pages and corresponding PMU data, as this time it is guaranteed to be the last aux reference holder. This partially reverts: 57ffc5ca679 ("perf: Fix AUX buffer refcounting") ... which was made to defer deallocation that was otherwise possible from an NMI context. Now it is no longer the case; the last call to rb_free_aux() that drops the last AUX reference has to happen in perf_mmap_close() on that AUX area. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Link: http://lkml.kernel.org/r/87d1qtz23d.fsf@ashishki-desk.ger.corp.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 120 +++++++++++++++++++++++++++++++++++- kernel/events/internal.h | 1 - kernel/events/ring_buffer.c | 37 ++++------- 3 files changed, 129 insertions(+), 29 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 525d11c59287b..243df4b628701 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1925,8 +1925,13 @@ event_sched_in(struct perf_event *event, if (event->state <= PERF_EVENT_STATE_OFF) return 0; - event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = smp_processor_id(); + WRITE_ONCE(event->oncpu, smp_processor_id()); + /* + * Order event::oncpu write to happen before the ACTIVE state + * is visible. + */ + smp_wmb(); + WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several @@ -2358,6 +2363,29 @@ void perf_event_enable(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_enable); +static int __perf_event_stop(void *info) +{ + struct perf_event *event = info; + + /* for AUX events, our job is done if the event is already inactive */ + if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) + return 0; + + /* matches smp_wmb() in event_sched_in() */ + smp_rmb(); + + /* + * There is a window with interrupts enabled before we get here, + * so we need to check again lest we try to stop another CPU's event. + */ + if (READ_ONCE(event->oncpu) != smp_processor_id()) + return -EAGAIN; + + event->pmu->stop(event, PERF_EF_UPDATE); + + return 0; +} + static int _perf_event_refresh(struct perf_event *event, int refresh) { /* @@ -4667,6 +4695,8 @@ static void perf_mmap_open(struct vm_area_struct *vma) event->pmu->event_mapped(event); } +static void perf_pmu_output_stop(struct perf_event *event); + /* * A buffer can be mmap()ed multiple times; either directly through the same * event, or through other events by use of perf_event_set_output(). @@ -4694,10 +4724,22 @@ static void perf_mmap_close(struct vm_area_struct *vma) */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + /* + * Stop all AUX events that are writing to this buffer, + * so that we can free its AUX pages and corresponding PMU + * data. Note that after rb::aux_mmap_count dropped to zero, + * they won't start any more (see perf_aux_output_begin()). 
+ */ + perf_pmu_output_stop(event); + + /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + /* this has to be the last one */ rb_free_aux(rb); + WARN_ON_ONCE(atomic_read(&rb->aux_refcount)); + mutex_unlock(&event->mmap_mutex); } @@ -5768,6 +5810,80 @@ perf_event_aux(perf_event_aux_output_cb output, void *data, rcu_read_unlock(); } +struct remote_output { + struct ring_buffer *rb; + int err; +}; + +static void __perf_event_output_stop(struct perf_event *event, void *data) +{ + struct perf_event *parent = event->parent; + struct remote_output *ro = data; + struct ring_buffer *rb = ro->rb; + + if (!has_aux(event)) + return; + + if (!parent) + parent = event; + + /* + * In case of inheritance, it will be the parent that links to the + * ring-buffer, but it will be the child that's actually using it: + */ + if (rcu_dereference(parent->rb) == rb) + ro->err = __perf_event_stop(event); +} + +static int __perf_pmu_output_stop(void *info) +{ + struct perf_event *event = info; + struct pmu *pmu = event->pmu; + struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + struct remote_output ro = { + .rb = event->rb, + }; + + rcu_read_lock(); + perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro); + if (cpuctx->task_ctx) + perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop, + &ro); + rcu_read_unlock(); + + return ro.err; +} + +static void perf_pmu_output_stop(struct perf_event *event) +{ + struct perf_event *iter; + int err, cpu; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { + /* + * For per-CPU events, we need to make sure that neither they + * nor their children are running; for cpu==-1 events it's + * sufficient to stop the event itself if it's active, since + * it can't have children. + */ + cpu = iter->cpu; + if (cpu == -1) + cpu = READ_ONCE(iter->oncpu); + + if (cpu == -1) + continue; + + err = cpu_function_call(cpu, __perf_pmu_output_stop, event); + if (err == -EAGAIN) { + rcu_read_unlock(); + goto restart; + } + } + rcu_read_unlock(); +} + /* * task tracking -- fork/exit * diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 2bbad9c1274c3..2b229fdcfc099 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -11,7 +11,6 @@ struct ring_buffer { atomic_t refcount; struct rcu_head rcu_head; - struct irq_work irq_work; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; int page_order; /* allocation order */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 89abf623e93c6..367e9c56ec0bc 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -221,8 +221,6 @@ void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static void rb_irq_work(struct irq_work *work); - static void ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) { @@ -243,16 +241,6 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); - init_irq_work(&rb->irq_work, rb_irq_work); -} - -static void ring_buffer_put_async(struct ring_buffer *rb) -{ - if (!atomic_dec_and_test(&rb->refcount)) - return; - - rb->rcu_head.next = (void *)rb; - irq_work_queue(&rb->irq_work); } /* @@ -292,7 +280,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * the aux buffer is in perf_mmap_close(), about to get freed. 
*/ if (!atomic_read(&rb->aux_mmap_count)) - goto err; + goto err_put; /* * Nesting is not supported for AUX area, make sure nested @@ -338,7 +326,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, rb_free_aux(rb); err: - ring_buffer_put_async(rb); + ring_buffer_put(rb); handle->event = NULL; return NULL; @@ -389,7 +377,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, local_set(&rb->aux_nest, 0); rb_free_aux(rb); - ring_buffer_put_async(rb); + ring_buffer_put(rb); } /* @@ -470,6 +458,14 @@ static void __rb_free_aux(struct ring_buffer *rb) { int pg; + /* + * Should never happen, the last reference should be dropped from + * perf_mmap_close() path, which first stops aux transactions (which + * in turn are the atomic holders of aux_refcount) and then does the + * last rb_free_aux(). + */ + WARN_ON_ONCE(in_atomic()); + if (rb->aux_priv) { rb->free_aux(rb->aux_priv); rb->free_aux = NULL; @@ -581,18 +577,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) - irq_work_queue(&rb->irq_work); -} - -static void rb_irq_work(struct irq_work *work) -{ - struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work); - - if (!atomic_read(&rb->aux_refcount)) __rb_free_aux(rb); - - if (rb->rcu_head.next == (void *)rb) - call_rcu(&rb->rcu_head, rb_free_rcu); } #ifndef CONFIG_PERF_USE_VMALLOC -- GitLab From af5bb4ed1254a378b6028c09e58bdcc1cd9bf5b3 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 4 Mar 2016 15:42:47 +0200 Subject: [PATCH 036/705] perf/ring_buffer: Document AUX API usage In order to ensure safe AUX buffer management, we rely on the assumption that pmu::stop() stops its ongoing AUX transaction and not just the hw. This patch documents this requirement for the perf_aux_output_{begin,end}() APIs. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Link: http://lkml.kernel.org/r/1457098969-21595-4-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 367e9c56ec0bc..0ed4555309bd3 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -252,6 +252,10 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) * The ordering is similar to that of perf_output_{begin,end}, with * the exception of (B), which should be taken care of by the pmu * driver, since ordering rules will differ depending on hardware. + * + * Call this from pmu::start(); see the comment in perf_aux_output_end() + * about its use in pmu callbacks. Both can also be called from the PMI + * handler if needed. */ void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) @@ -323,6 +327,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, return handle->rb->aux_priv; err_put: + /* can't be last */ rb_free_aux(rb); err: @@ -337,6 +342,10 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * aux_head and posting a PERF_RECORD_AUX into the perf buffer. 
It is the * pmu driver's responsibility to observe ordering rules of the hardware, * so that all the data is externally visible before this is called. + * + * Note: this has to be called from pmu::stop() callback, as the assumption + * of the AUX buffer management code is that after pmu::stop(), the AUX + * transaction must be stopped and therefore drop the AUX reference count. */ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, bool truncated) @@ -376,6 +385,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, handle->event = NULL; local_set(&rb->aux_nest, 0); + /* can't be last */ rb_free_aux(rb); ring_buffer_put(rb); } -- GitLab From 66d219014a4ee47ad4ca2b9db5fe6547353e2a56 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 4 Mar 2016 15:42:48 +0200 Subject: [PATCH 037/705] perf/x86/intel/pt: Move transaction start/stop to PMU start/stop callbacks As per AUX buffer management requirement, AUX output has to happen between pmu::start and pmu::stop calls so that perf_event_stop() actually stops it and therefore perf can free the AUX data after it has called pmu::stop. This patch moves perf_aux_output_{begin,end} from pt_event_{add,del} to pt_event_{start,stop}. As a bonus, we get rid of pt_buffer_is_full(), which is already taken care of by perf_aux_output_begin() anyway. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Link: http://lkml.kernel.org/r/1457098969-21595-5-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/pt.c | 85 ++++++++++++-------------------------- 1 file changed, 27 insertions(+), 58 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 6af7cf71d6b2e..127f58c179767 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -905,26 +905,6 @@ static void pt_buffer_free_aux(void *data) kfree(buf); } -/** - * pt_buffer_is_full() - check if the buffer is full - * @buf: PT buffer. - * @pt: Per-cpu pt handle. - * - * If the user hasn't read data from the output region that aux_head - * points to, the buffer is considered full: the user needs to read at - * least this region and update aux_tail to point past it. 
- */ -static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt) -{ - if (buf->snapshot) - return false; - - if (local_read(&buf->data_size) >= pt->handle.size) - return true; - - return false; -} - /** * intel_pt_interrupt() - PT PMI handler */ @@ -989,20 +969,33 @@ void intel_pt_interrupt(void) static void pt_event_start(struct perf_event *event, int mode) { + struct hw_perf_event *hwc = &event->hw; struct pt *pt = this_cpu_ptr(&pt_ctx); - struct pt_buffer *buf = perf_get_aux(&pt->handle); + struct pt_buffer *buf; - if (!buf || pt_buffer_is_full(buf, pt)) { - event->hw.state = PERF_HES_STOPPED; - return; + buf = perf_aux_output_begin(&pt->handle, event); + if (!buf) + goto fail_stop; + + pt_buffer_reset_offsets(buf, pt->handle.head); + if (!buf->snapshot) { + if (pt_buffer_reset_markers(buf, &pt->handle)) + goto fail_end_stop; } ACCESS_ONCE(pt->handle_nmi) = 1; - event->hw.state = 0; + hwc->state = 0; pt_config_buffer(buf->cur->table, buf->cur_idx, buf->output_off); pt_config(event); + + return; + +fail_end_stop: + perf_aux_output_end(&pt->handle, 0, true); +fail_stop: + hwc->state = PERF_HES_STOPPED; } static void pt_event_stop(struct perf_event *event, int mode) @@ -1035,19 +1028,7 @@ static void pt_event_stop(struct perf_event *event, int mode) pt_handle_status(pt); pt_update_head(pt); - } -} -static void pt_event_del(struct perf_event *event, int mode) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - struct pt_buffer *buf; - - pt_event_stop(event, PERF_EF_UPDATE); - - buf = perf_get_aux(&pt->handle); - - if (buf) { if (buf->snapshot) pt->handle.head = local_xchg(&buf->data_size, @@ -1057,9 +1038,13 @@ static void pt_event_del(struct perf_event *event, int mode) } } +static void pt_event_del(struct perf_event *event, int mode) +{ + pt_event_stop(event, PERF_EF_UPDATE); +} + static int pt_event_add(struct perf_event *event, int mode) { - struct pt_buffer *buf; struct pt *pt = this_cpu_ptr(&pt_ctx); struct hw_perf_event *hwc = &event->hw; int ret = -EBUSY; @@ -1067,34 +1052,18 @@ static int pt_event_add(struct perf_event *event, int mode) if (pt->handle.event) goto fail; - buf = perf_aux_output_begin(&pt->handle, event); - ret = -EINVAL; - if (!buf) - goto fail_stop; - - pt_buffer_reset_offsets(buf, pt->handle.head); - if (!buf->snapshot) { - ret = pt_buffer_reset_markers(buf, &pt->handle); - if (ret) - goto fail_end_stop; - } - if (mode & PERF_EF_START) { pt_event_start(event, 0); - ret = -EBUSY; + ret = -EINVAL; if (hwc->state == PERF_HES_STOPPED) - goto fail_end_stop; + goto fail; } else { hwc->state = PERF_HES_STOPPED; } - return 0; - -fail_end_stop: - perf_aux_output_end(&pt->handle, 0, true); -fail_stop: - hwc->state = PERF_HES_STOPPED; + ret = 0; fail: + return ret; } -- GitLab From 981a4cb380d3dff7010ce9f89618064a254eab8c Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 4 Mar 2016 15:42:49 +0200 Subject: [PATCH 038/705] perf/x86/intel/bts: Move transaction start/stop to start/stop callbacks As per AUX buffer management requirement, AUX output has to happen between pmu::start and pmu::stop calls so that perf_event_stop() actually stops it and therefore perf can free the AUX data after it has called pmu::stop. This patch moves perf_aux_output_{begin,end} from bts_event_{add,del} to bts_event_{start,stop}. As a bonus, we get rid of bts_buffer_is_full(), which is already taken care of by perf_aux_output_begin() anyway. 
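The resulting shape of the callbacks, as a minimal editorial sketch of the convention (names prefixed foo_ are illustrative, not the actual BTS or PT code):

	/* pmu::start owns perf_aux_output_begin()... */
	static void foo_event_start(struct perf_event *event, int flags)
	{
		/* foo_handle: the driver's per-CPU struct perf_output_handle */
		void *buf = perf_aux_output_begin(&foo_handle, event);

		if (!buf) {
			event->hw.state = PERF_HES_STOPPED;
			return;
		}
		/* ... program the hardware to write into buf ... */
	}

	/*
	 * ... and pmu::stop owns perf_aux_output_end(), so that
	 * perf_event_stop() really stops AUX output.
	 */
	static void foo_event_stop(struct perf_event *event, int flags)
	{
		/* ... stop the hardware first ... */
		if (flags & PERF_EF_UPDATE)
			/* size/truncated come from the driver's bookkeeping */
			perf_aux_output_end(&foo_handle, size, truncated);
	}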
Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Link: http://lkml.kernel.org/r/1457098969-21595-6-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/bts.c | 105 +++++++++++++++++------------------- 1 file changed, 48 insertions(+), 57 deletions(-) diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index b99dc9258c0f9..0a6e393a2e629 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -171,18 +171,6 @@ static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head) memset(page_address(phys->page) + index, 0, phys->size - index); } -static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts) -{ - if (buf->snapshot) - return false; - - if (local_read(&buf->data_size) >= bts->handle.size || - bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE) - return true; - - return false; -} - static void bts_update(struct bts_ctx *bts) { int cpu = raw_smp_processor_id(); @@ -213,18 +201,15 @@ static void bts_update(struct bts_ctx *bts) } } +static int +bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle); + static void __bts_event_start(struct perf_event *event) { struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); struct bts_buffer *buf = perf_get_aux(&bts->handle); u64 config = 0; - if (!buf || bts_buffer_is_full(buf, bts)) - return; - - event->hw.itrace_started = 1; - event->hw.state = 0; - if (!buf->snapshot) config |= ARCH_PERFMON_EVENTSEL_INT; if (!event->attr.exclude_kernel) @@ -241,16 +226,41 @@ static void __bts_event_start(struct perf_event *event) wmb(); intel_pmu_enable_bts(config); + } static void bts_event_start(struct perf_event *event, int flags) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_buffer *buf; + + buf = perf_aux_output_begin(&bts->handle, event); + if (!buf) + goto fail_stop; + + if (bts_buffer_reset(buf, &bts->handle)) + goto fail_end_stop; + + bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; + bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; + bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; + + event->hw.itrace_started = 1; + event->hw.state = 0; __bts_event_start(event); /* PMI handler: this counter is running and likely generating PMIs */ ACCESS_ONCE(bts->started) = 1; + + return; + +fail_end_stop: + perf_aux_output_end(&bts->handle, 0, false); + +fail_stop: + event->hw.state = PERF_HES_STOPPED; } static void __bts_event_stop(struct perf_event *event) @@ -269,15 +279,32 @@ static void __bts_event_stop(struct perf_event *event) static void bts_event_stop(struct perf_event *event, int flags) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_buffer *buf = perf_get_aux(&bts->handle); /* PMI handler: don't restart this counter */ ACCESS_ONCE(bts->started) = 0; __bts_event_stop(event); - if (flags & PERF_EF_UPDATE) + if (flags & PERF_EF_UPDATE) { bts_update(bts); + + if (buf) { + if (buf->snapshot) + bts->handle.head = + local_xchg(&buf->data_size, + buf->nr_pages << PAGE_SHIFT); + perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), + !!local_xchg(&buf->lost, 0)); + } + + 
cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; + cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base; + cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum; + cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold; + } } void intel_bts_enable_local(void) @@ -417,34 +444,14 @@ int intel_bts_interrupt(void) static void bts_event_del(struct perf_event *event, int mode) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct bts_buffer *buf = perf_get_aux(&bts->handle); - bts_event_stop(event, PERF_EF_UPDATE); - - if (buf) { - if (buf->snapshot) - bts->handle.head = - local_xchg(&buf->data_size, - buf->nr_pages << PAGE_SHIFT); - perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), - !!local_xchg(&buf->lost, 0)); - } - - cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; - cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base; - cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum; - cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold; } static int bts_event_add(struct perf_event *event, int mode) { - struct bts_buffer *buf; struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - int ret = -EBUSY; event->hw.state = PERF_HES_STOPPED; @@ -454,26 +461,10 @@ static int bts_event_add(struct perf_event *event, int mode) if (bts->handle.event) return -EBUSY; - buf = perf_aux_output_begin(&bts->handle, event); - if (!buf) - return -EINVAL; - - ret = bts_buffer_reset(buf, &bts->handle); - if (ret) { - perf_aux_output_end(&bts->handle, 0, false); - return ret; - } - - bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; - bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; - bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; - if (mode & PERF_EF_START) { bts_event_start(event, 0); - if (hwc->state & PERF_HES_STOPPED) { - bts_event_del(event, 0); - return -EBUSY; - } + if (hwc->state & PERF_HES_STOPPED) + return -EINVAL; } return 0; -- GitLab From 0a74c5b3d20d2a8693848b6ae4f1a97624f5b781 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 16 Mar 2016 15:34:29 +0100 Subject: [PATCH 039/705] ftrace/perf: Check sample types only for sampling events Currently we check the sample type for ftrace:function events even when the event is not created as a sampling event. That prevents creating an ftrace:function event in counting mode. Make sure we check sample types only for sampling events. Before: $ sudo perf stat -e ftrace:function ls ... 
Performance counter stats for 'ls': 44,498 ftrace:function 0.037534722 seconds time elapsed Suggested-by: Namhyung Kim Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1458138873-1553-2-git-send-email-jolsa@kernel.org Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 00df25fd86ef4..e11108f1d1973 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -47,6 +47,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) return -EPERM; + if (!is_sampling_event(p_event)) + return 0; + /* * We don't allow user space callchains for function trace * event, due to issues with page faults while tracing page -- GitLab From 86e7972f690c1017fd086cdfe53d8524e68c661c Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 28 Mar 2016 06:41:29 +0000 Subject: [PATCH 040/705] perf/ring_buffer: Introduce new ioctl options to pause and resume the ring-buffer Add a new ioctl() to pause/resume ring-buffer output. In some situations we want to read from the ring-buffer only when we ensure nothing can write to the ring-buffer during reading. Without this patch we have to turn off all events attached to this ring-buffer to achieve this. This patch is a prerequisite to enable overwrite support for the perf ring buffer. Following commits will introduce new methods to support reading from an overwritable ring buffer. Before reading, the caller must ensure the ring buffer is frozen, or the reading is unreliable. 
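As an illustrative user (not part of this patch), a reader would bracket its accesses with the new ioctl. A minimal sketch, assuming perf_fd is a perf_event_open() file descriptor whose ring buffer has been mmap()ed:

	/* Illustration only: freeze output, read safely, then resume. */
	if (ioctl(perf_fd, PERF_EVENT_IOC_PAUSE_OUTPUT, 1) == 0) {
		/* ... read records from the mmap()ed ring buffer ... */
		ioctl(perf_fd, PERF_EVENT_IOC_PAUSE_OUTPUT, 0);
	}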
Signed-off-by: Wang Nan Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Arnaldo Carvalho de Melo Cc: Brendan Gregg Cc: He Kuang Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Zefan Li Link: http://lkml.kernel.org/r/1459147292-239310-2-git-send-email-wangnan0@huawei.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 1 + kernel/events/core.c | 13 +++++++++++++ kernel/events/internal.h | 9 +++++++++ kernel/events/ring_buffer.c | 12 +++++++++++- 4 files changed, 34 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1afe9623c1a72..a3c19034d5f8d 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -401,6 +401,7 @@ struct perf_event_attr { #define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) +#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, diff --git a/kernel/events/core.c b/kernel/events/core.c index 243df4b628701..51386e84293e3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4379,6 +4379,19 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_SET_BPF: return perf_event_set_bpf_prog(event, arg); + case PERF_EVENT_IOC_PAUSE_OUTPUT: { + struct ring_buffer *rb; + + rcu_read_lock(); + rb = rcu_dereference(event->rb); + if (!rb || !rb->nr_pages) { + rcu_read_unlock(); + return -EINVAL; + } + rb_toggle_paused(rb, !!arg); + rcu_read_unlock(); + return 0; + } default: return -ENOTTY; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 2b229fdcfc099..2d67327d9ad98 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -17,6 +17,7 @@ struct ring_buffer { #endif int nr_pages; /* nr of data pages */ int overwrite; /* can overwrite itself */ + int paused; /* can write into ring buffer */ atomic_t poll; /* POLL_ for wakeups */ @@ -64,6 +65,14 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head) rb_free(rb); } +static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause) +{ + if (!pause && rb->nr_pages) + rb->paused = 0; + else + rb->paused = 1; +} + extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 0ed4555309bd3..72d8127bb8fde 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -125,8 +125,11 @@ int perf_output_begin(struct perf_output_handle *handle, if (unlikely(!rb)) goto out; - if (unlikely(!rb->nr_pages)) + if (unlikely(rb->paused)) { + if (rb->nr_pages) + local_inc(&rb->lost); goto out; + } handle->rb = rb; handle->event = event; @@ -241,6 +244,13 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); + + /* + * perf_output_begin() only checks rb->paused, therefore + * rb->paused must be true if we have no pages for output. 
+ */ + if (!rb->nr_pages) + rb->paused = 1; } /* -- GitLab From 1879445dfa7bbd6fe21b09c5cc72f4934798afed Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 28 Mar 2016 06:41:30 +0000 Subject: [PATCH 041/705] perf/core: Set event's default ::overflow_handler() Set a default event->overflow_handler in perf_event_alloc() so we don't need to check event->overflow_handler in __perf_event_overflow(). Following commits can give a different default overflow_handler. Initial idea comes from Peter: http://lkml.kernel.org/r/20130708121557.GA17211@twins.programming.kicks-ass.net Since the default value of event->overflow_handler is not NULL, existing 'if (!overflow_handler)' checks need to be changed. is_default_overflow_handler() is introduced for this. No extra performance overhead is introduced into the hot path because in the original code we still need to read this handler from memory. A conditional branch is avoided, so we actually remove some instructions. Signed-off-by: Wang Nan Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Arnaldo Carvalho de Melo Cc: Brendan Gregg Cc: He Kuang Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Zefan Li Link: http://lkml.kernel.org/r/1459147292-239310-3-git-send-email-wangnan0@huawei.com Signed-off-by: Ingo Molnar --- arch/arm/kernel/hw_breakpoint.c | 4 ++-- arch/arm64/kernel/hw_breakpoint.c | 4 ++-- include/linux/perf_event.h | 6 ++++++ kernel/events/core.c | 14 ++++++++------ 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 6284779d64ee6..b8df45883cf78 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -631,7 +631,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) info->address &= ~alignment_mask; info->ctrl.len <<= offset; - if (!bp->overflow_handler) { + if (is_default_overflow_handler(bp)) { /* * Mismatch breakpoints are required for single-stepping * breakpoints. @@ -754,7 +754,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr, * mismatch breakpoint so we can single-step over the * watchpoint trigger. */ - if (!wp->overflow_handler) + if (is_default_overflow_handler(wp)) enable_single_step(wp, instruction_pointer(regs)); unlock: diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index b45c95d34b832..4ef5373f9a762 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -616,7 +616,7 @@ static int breakpoint_handler(unsigned long unused, unsigned int esr, perf_bp_event(bp, regs); /* Do we need to handle the stepping? */ - if (!bp->overflow_handler) + if (is_default_overflow_handler(bp)) step = 1; unlock: rcu_read_unlock(); @@ -712,7 +712,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, perf_bp_event(wp, regs); /* Do we need to handle the stepping? 
*/ - if (!wp->overflow_handler) + if (is_default_overflow_handler(wp)) step = 1; unlock: diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 15588d4c581de..4065ca2d71494 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -838,6 +838,12 @@ extern void perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); +static inline bool +is_default_overflow_handler(struct perf_event *event) +{ + return (event->overflow_handler == perf_event_output); +} + extern void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, diff --git a/kernel/events/core.c b/kernel/events/core.c index 51386e84293e3..8c3b35f2a2695 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6628,10 +6628,7 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(&event->pending); } - if (event->overflow_handler) - event->overflow_handler(event, data, regs); - else - perf_event_output(event, data, regs); + event->overflow_handler(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; @@ -8152,8 +8149,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, context = parent_event->overflow_handler_context; } - event->overflow_handler = overflow_handler; - event->overflow_handler_context = context; + if (overflow_handler) { + event->overflow_handler = overflow_handler; + event->overflow_handler_context = context; + } else { + event->overflow_handler = perf_event_output; + event->overflow_handler_context = NULL; + } perf_event__state_init(event); -- GitLab From d1b26c70246bc72922ae61d9f972d5c2588409e7 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 28 Mar 2016 06:41:31 +0000 Subject: [PATCH 042/705] perf/ring_buffer: Prepare writing into the ring-buffer from the end Convert perf_output_begin() to __perf_output_begin() and make the latter function able to write records from the end of the ring-buffer. Following commits will utilize the 'backward' flag. This is the core patch to support writing to the ring-buffer backwards, which will be introduced by upcoming patches to support reading from overwritable ring-buffers. In theory, this patch should not introduce any extra performance overhead since we use always_inline, but it does not hurt to double check that assumption: When CONFIG_OPTIMIZE_INLINING is disabled, the output object is nearly identical to the original one. See: http://lkml.kernel.org/g/56F52E83.70409@huawei.com When CONFIG_OPTIMIZE_INLINING is enabled, the resulting object file becomes smaller: $ size kernel/events/ring_buffer.o* text data bss dec hex filename 4641 4 8 4653 122d kernel/events/ring_buffer.o.old 4545 4 8 4557 11cd kernel/events/ring_buffer.o.new Performance testing results: Calling 'close(-1)' 3000000 times, using gettimeofday() to check the duration. Use 'perf record -o /dev/null -e raw_syscalls:*' to capture system calls. In ns. Testing environment: CPU : Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz Kernel : v4.5.0

             MEAN          STDVAR
  BASE     800214.950    2853.083
  PRE     2253846.700    9997.014
  POST    2257495.540    8516.293

Where 'BASE' is pure performance without capturing, 'PRE' is the test result of the pure 'v4.5.0' kernel, and 'POST' is the test result after this patch. Considering the stdvar, this patch doesn't hurt performance, within noise margin. 
For testing details, see: http://lkml.kernel.org/g/56F89DCD.1040202@huawei.com Signed-off-by: Wang Nan Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Arnaldo Carvalho de Melo Cc: Brendan Gregg Cc: He Kuang Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Zefan Li Link: http://lkml.kernel.org/r/1459147292-239310-4-git-send-email-wangnan0@huawei.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 42 +++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 72d8127bb8fde..60be55a640408 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -102,8 +102,21 @@ static void perf_output_put_handle(struct perf_output_handle *handle) preempt_enable(); } -int perf_output_begin(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size) +static bool __always_inline +ring_buffer_has_space(unsigned long head, unsigned long tail, + unsigned long data_size, unsigned int size, + bool backward) +{ + if (!backward) + return CIRC_SPACE(head, tail, data_size) >= size; + else + return CIRC_SPACE(tail, head, data_size) >= size; +} + +static int __always_inline +__perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + bool backward) { struct ring_buffer *rb; unsigned long tail, offset, head; @@ -146,9 +159,12 @@ int perf_output_begin(struct perf_output_handle *handle, do { tail = READ_ONCE(rb->user_page->data_tail); offset = head = local_read(&rb->head); - if (!rb->overwrite && - unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) - goto fail; + if (!rb->overwrite) { + if (unlikely(!ring_buffer_has_space(head, tail, + perf_data_size(rb), + size, backward))) + goto fail; + } /* * The above forms a control dependency barrier separating the @@ -162,9 +178,17 @@ int perf_output_begin(struct perf_output_handle *handle, * See perf_output_put_handle(). */ - head += size; + if (!backward) + head += size; + else + head -= size; } while (local_cmpxchg(&rb->head, offset, head) != offset); + if (backward) { + offset = head; + head = (u64)(-head); + } + /* * We rely on the implied barrier() by local_cmpxchg() to ensure * none of the data stores below can be lifted up by the compiler. @@ -206,6 +230,12 @@ int perf_output_begin(struct perf_output_handle *handle, return -ENOSPC; } +int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size) +{ + return __perf_output_begin(handle, event, size, false); +} + unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { -- GitLab From 5ca3726af7f66a8cc71ce4414cfeb86deb784491 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 22 Mar 2016 16:37:07 +0800 Subject: [PATCH 043/705] sched/cpuacct: Show all possible CPUs in cpuacct output Current code shows stats of online CPUs in cpuacct.stat, shows stats of present CPUs in cpuacct.usage(_percpu), and uses present CPUs for setting cpuacct.usage. This causes inconsistent results when a CPU goes online or offline or is hotplugged. We should always use possible CPUs to avoid the above problem. 
Here are the contents of a cpuacct.usage_percpu sysfs file, on a 4 CPU system with maxcpus=32: Before the patch: # cat cpuacct.usage_percpu 2456565 411435 1052897 832584 After the patch: # cat cpuacct.usage_percpu 2456565 411435 1052897 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Suggested-by: Peter Zijlstra Signed-off-by: Zhao Lei Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Tejun Heo Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a11d56cef12d0b4807f8be3a46bf9798c3014d59.1458635566.git.zhaolei@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 4a811203c04a4..e39bd4faf2a8b 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -138,7 +138,7 @@ static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) u64 totalcpuusage = 0; int i; - for_each_present_cpu(i) + for_each_possible_cpu(i) totalcpuusage += cpuacct_cpuusage_read(ca, i); return totalcpuusage; @@ -159,7 +159,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, goto out; } - for_each_present_cpu(i) + for_each_possible_cpu(i) cpuacct_cpuusage_write(ca, i, 0); out: @@ -172,7 +172,7 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) u64 percpu; int i; - for_each_present_cpu(i) { + for_each_possible_cpu(i) { percpu = cpuacct_cpuusage_read(ca, i); seq_printf(m, "%llu ", (unsigned long long) percpu); } @@ -191,7 +191,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) int cpu; s64 val = 0; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); val += kcpustat->cpustat[CPUTIME_USER]; val += kcpustat->cpustat[CPUTIME_NICE]; @@ -200,7 +200,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); val = 0; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); val += kcpustat->cpustat[CPUTIME_SYSTEM]; val += kcpustat->cpustat[CPUTIME_IRQ]; -- GitLab From d740037fac7052e49450f6fa1454f1144a103b55 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 22 Mar 2016 16:37:08 +0800 Subject: [PATCH 044/705] sched/cpuacct: Split usage accounting into user_usage and sys_usage Sometimes, cpuacct.usage is not detailed enough to see how much CPU usage a group had. We want to know how much time it used in user mode and how much in kernel mode. This patch introduces more files to give this information: # ls /sys/fs/cgroup/cpuacct/cpuacct.usage* /sys/fs/cgroup/cpuacct/cpuacct.usage /sys/fs/cgroup/cpuacct/cpuacct.usage_percpu /sys/fs/cgroup/cpuacct/cpuacct.usage_user /sys/fs/cgroup/cpuacct/cpuacct.usage_percpu_user /sys/fs/cgroup/cpuacct/cpuacct.usage_sys /sys/fs/cgroup/cpuacct/cpuacct.usage_percpu_sys ... while keeping the ABI with the existing counter. Signed-off-by: Dongsheng Yang [ Ported to newer kernels. 
] Signed-off-by: Zhao Lei Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/aa171da036b520b51c79549e9b3215d29473f19d.1458635566.git.zhaolei@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 140 +++++++++++++++++++++++++++++++++-------- 1 file changed, 113 insertions(+), 27 deletions(-) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index e39bd4faf2a8b..df947e07aac1d 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -25,11 +25,22 @@ enum cpuacct_stat_index { CPUACCT_STAT_NSTATS, }; +enum cpuacct_usage_index { + CPUACCT_USAGE_USER, /* ... user mode */ + CPUACCT_USAGE_SYSTEM, /* ... kernel mode */ + + CPUACCT_USAGE_NRUSAGE, +}; + +struct cpuacct_usage { + u64 usages[CPUACCT_USAGE_NRUSAGE]; +}; + /* track cpu usage of a group of tasks and its child groups */ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 __percpu *cpuusage; + struct cpuacct_usage __percpu *cpuusage; struct kernel_cpustat __percpu *cpustat; }; @@ -49,7 +60,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) return css_ca(ca->css.parent); } -static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); +static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage); static struct cpuacct root_cpuacct = { .cpustat = &kernel_cpustat, .cpuusage = &root_cpuacct_cpuusage, @@ -68,7 +79,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) if (!ca) goto out; - ca->cpuusage = alloc_percpu(u64); + ca->cpuusage = alloc_percpu(struct cpuacct_usage); if (!ca->cpuusage) goto out_free_ca; @@ -96,20 +107,37 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) kfree(ca); } -static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, + enum cpuacct_usage_index index) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; + /* + * We allow index == CPUACCT_USAGE_NRUSAGE here to read + * the sum of suages. + */ + BUG_ON(index > CPUACCT_USAGE_NRUSAGE); + #ifndef CONFIG_64BIT /* * Take rq->lock to make 64-bit read safe on 32-bit platforms. */ raw_spin_lock_irq(&cpu_rq(cpu)->lock); - data = *cpuusage; +#endif + + if (index == CPUACCT_USAGE_NRUSAGE) { + int i = 0; + + data = 0; + for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + data += cpuusage->usages[i]; + } else { + data = cpuusage->usages[index]; + } + +#ifndef CONFIG_64BIT raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - data = *cpuusage; #endif return data; @@ -117,69 +145,103 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + int i; #ifndef CONFIG_64BIT /* * Take rq->lock to make 64-bit write safe on 32-bit platforms. 
*/ raw_spin_lock_irq(&cpu_rq(cpu)->lock); - *cpuusage = val; +#endif + + for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + cpuusage->usages[i] = val; + +#ifndef CONFIG_64BIT raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - *cpuusage = val; #endif } /* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) +static u64 __cpuusage_read(struct cgroup_subsys_state *css, + enum cpuacct_usage_index index) { struct cpuacct *ca = css_ca(css); u64 totalcpuusage = 0; int i; for_each_possible_cpu(i) - totalcpuusage += cpuacct_cpuusage_read(ca, i); + totalcpuusage += cpuacct_cpuusage_read(ca, i, index); return totalcpuusage; } +static u64 cpuusage_user_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return __cpuusage_read(css, CPUACCT_USAGE_USER); +} + +static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); +} + +static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); +} + static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct cpuacct *ca = css_ca(css); - int err = 0; - int i; + int cpu; /* * Only allow '0' here to do a reset. */ - if (val) { - err = -EINVAL; - goto out; - } + if (val) + return -EINVAL; - for_each_possible_cpu(i) - cpuacct_cpuusage_write(ca, i, 0); + for_each_possible_cpu(cpu) + cpuacct_cpuusage_write(ca, cpu, 0); -out: - return err; + return 0; } -static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) +static int __cpuacct_percpu_seq_show(struct seq_file *m, + enum cpuacct_usage_index index) { struct cpuacct *ca = css_ca(seq_css(m)); u64 percpu; int i; for_each_possible_cpu(i) { - percpu = cpuacct_cpuusage_read(ca, i); + percpu = cpuacct_cpuusage_read(ca, i, index); seq_printf(m, "%llu ", (unsigned long long) percpu); } seq_printf(m, "\n"); return 0; } +static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) +{ + return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); +} + +static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) +{ + return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); +} + +static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) +{ + return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); +} + static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_USER] = "user", [CPUACCT_STAT_SYSTEM] = "system", @@ -219,10 +281,26 @@ static struct cftype files[] = { .read_u64 = cpuusage_read, .write_u64 = cpuusage_write, }, + { + .name = "usage_user", + .read_u64 = cpuusage_user_read, + }, + { + .name = "usage_sys", + .read_u64 = cpuusage_sys_read, + }, { .name = "usage_percpu", .seq_show = cpuacct_percpu_seq_show, }, + { + .name = "usage_percpu_user", + .seq_show = cpuacct_percpu_user_seq_show, + }, + { + .name = "usage_percpu_sys", + .seq_show = cpuacct_percpu_sys_seq_show, + }, { .name = "stat", .seq_show = cpuacct_stats_show, @@ -238,10 +316,18 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; + int index; + + if (user_mode(task_pt_regs(tsk))) + index = CPUACCT_USAGE_USER; + else + index = CPUACCT_USAGE_SYSTEM; rcu_read_lock(); + for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - *this_cpu_ptr(ca->cpuusage) += cputime; + this_cpu_ptr(ca->cpuusage)->usages[index] += cputime; + rcu_read_unlock(); } -- GitLab From d02c071183e1c01a76811c878c8a52322201f81f Mon Sep 17 
00:00:00 2001 From: Srikar Dronamraju Date: Wed, 23 Mar 2016 17:54:44 +0530 Subject: [PATCH 045/705] sched/fair: Reset nr_balance_failed after active balancing To force a task migration during active balancing, nr_balance_failed is set to cache_nice_tries + 1. However, nr_balance_failed is not reset. As a side effect, during the next regular load balance under the same sd, a cache-hot task might be migrated just because the nr_balance_failed count is high. Resetting nr_balance_failed after a successful active balance ensures that a hot task is not unreasonably migrated. This can be verified by looking at the number of hot task migrations reported by /proc/schedstat. Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1458735884-30105-1-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0fe30e66aff1d..cbb075e46b2c2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7399,10 +7399,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, &busiest->active_balance_work); } - /* - * We've kicked active balancing, reset the failure - * counter. - */ + /* We've kicked active balancing, force task migration. */ sd->nr_balance_failed = sd->cache_nice_tries+1; } } else @@ -7637,10 +7634,13 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); p = detach_one_task(&env); - if (p) + if (p) { schedstat_inc(sd, alb_pushed); - else + /* Active balancing done, reset the failure counter. */ + sd->nr_balance_failed = 0; + } else { schedstat_inc(sd, alb_failed); + } } rcu_read_unlock(); out_unlock: -- GitLab From bfdb198ccd99472c5bded689699eb30dd06316bb Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Mon, 1 Feb 2016 14:47:59 -0800 Subject: [PATCH 046/705] sched/numa: Remove unnecessary NUMA dequeue update from non-SMP kernels In account_entity_enqueue(), we do not do account_numa_enqueue() as NUMA balancing is not needed for UP kernels. Hence, we should remove the account_numa_dequeue() call from account_entity_dequeue() for UP kernels.
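For reference (quoted from memory of the surrounding fair.c, abridged, so treat it as a sketch): the enqueue side already compiles its per-task accounting out on UP builds, and the patch below simply makes the dequeue path symmetric with it:

	/* account_entity_enqueue(), abridged */
	#ifdef CONFIG_SMP
		if (entity_is_task(se)) {
			struct rq *rq = rq_of(cfs_rq);

			account_numa_enqueue(rq, task_of(se));
			list_add(&se->group_node, &rq->cfs_tasks);
		}
	#endif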
Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Rik van Riel Cc: Mel Gorman Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454366879.21738.29.camel@schen9-desk2.jf.intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cbb075e46b2c2..1fe9916b45577 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2437,10 +2437,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); +#ifdef CONFIG_SMP if (entity_is_task(se)) { account_numa_dequeue(rq_of(cfs_rq), task_of(se)); list_del_init(&se->group_node); } +#endif cfs_rq->nr_running--; } -- GitLab From 47252cfbac03644ee4a3adfa50c77896aa94f2bb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 21 Mar 2016 11:23:39 -0400 Subject: [PATCH 047/705] sched/core: Add preempt checks in preempt_schedule() code While testing the tracer preemptoff, I hit this strange trace: <...>-259 0...1 0us : schedule <-worker_thread <...>-259 0d..1 0us : rcu_note_context_switch <-__schedule <...>-259 0d..1 0us : rcu_sched_qs <-rcu_note_context_switch <...>-259 0d..1 0us : rcu_preempt_qs <-rcu_note_context_switch <...>-259 0d..1 0us : _raw_spin_lock <-__schedule <...>-259 0d..1 0us : preempt_count_add <-_raw_spin_lock <...>-259 0d..2 0us : do_raw_spin_lock <-_raw_spin_lock <...>-259 0d..2 1us : deactivate_task <-__schedule <...>-259 0d..2 1us : update_rq_clock.part.84 <-deactivate_task <...>-259 0d..2 1us : dequeue_task_fair <-deactivate_task <...>-259 0d..2 1us : dequeue_entity <-dequeue_task_fair <...>-259 0d..2 1us : update_curr <-dequeue_entity <...>-259 0d..2 1us : update_min_vruntime <-update_curr <...>-259 0d..2 1us : cpuacct_charge <-update_curr <...>-259 0d..2 1us : __rcu_read_lock <-cpuacct_charge <...>-259 0d..2 1us : __rcu_read_unlock <-cpuacct_charge <...>-259 0d..2 1us : clear_buddies <-dequeue_entity <...>-259 0d..2 1us : account_entity_dequeue <-dequeue_entity <...>-259 0d..2 2us : update_min_vruntime <-dequeue_entity <...>-259 0d..2 2us : update_cfs_shares <-dequeue_entity <...>-259 0d..2 2us : hrtick_update <-dequeue_task_fair <...>-259 0d..2 2us : wq_worker_sleeping <-__schedule <...>-259 0d..2 2us : kthread_data <-wq_worker_sleeping <...>-259 0d..2 2us : pick_next_task_fair <-__schedule <...>-259 0d..2 2us : check_cfs_rq_runtime <-pick_next_task_fair <...>-259 0d..2 2us : pick_next_entity <-pick_next_task_fair <...>-259 0d..2 2us : clear_buddies <-pick_next_entity <...>-259 0d..2 2us : pick_next_entity <-pick_next_task_fair <...>-259 0d..2 2us : clear_buddies <-pick_next_entity <...>-259 0d..2 2us : set_next_entity <-pick_next_task_fair <...>-259 0d..2 3us : put_prev_entity <-pick_next_task_fair <...>-259 0d..2 3us : check_cfs_rq_runtime <-put_prev_entity <...>-259 0d..2 3us : set_next_entity <-pick_next_task_fair gnome-sh-1031 0d..2 3us : finish_task_switch <-__schedule gnome-sh-1031 0d..2 3us : _raw_spin_unlock_irq <-finish_task_switch gnome-sh-1031 0d..2 3us : do_raw_spin_unlock <-_raw_spin_unlock_irq gnome-sh-1031 0...2 3us!: preempt_count_sub <-_raw_spin_unlock_irq gnome-sh-1031 0...1 582us : do_raw_spin_lock <-_raw_spin_lock gnome-sh-1031 0...1 583us : _raw_spin_unlock <-drm_gem_object_lookup gnome-sh-1031 0...1 583us : 
do_raw_spin_unlock <-_raw_spin_unlock gnome-sh-1031 0...1 583us : preempt_count_sub <-_raw_spin_unlock gnome-sh-1031 0...1 584us : _raw_spin_unlock <-drm_gem_object_lookup gnome-sh-1031 0...1 584us+: trace_preempt_on <-drm_gem_object_lookup gnome-sh-1031 0...1 603us : => preempt_count_sub => _raw_spin_unlock => drm_gem_object_lookup => i915_gem_madvise_ioctl => drm_ioctl => do_vfs_ioctl => SyS_ioctl => entry_SYSCALL_64_fastpath As I'm tracing preemption disabled, it seemed incorrect that the trace would go across a schedule and report not being in the scheduler. Looking into this I discovered the problem. schedule() calls preempt_disable(), but preempt_schedule() calls preempt_enable_notrace(). What happened above was that the gnome-shell task was preempted on another CPU, then migrated over to the idle CPU. The tracer started with idle calling schedule(), which called preempt_disable(), but then gnome-shell finished, and it enabled preemption with preempt_enable_notrace(), which does not stop the trace, even though preemption was actually enabled. The purpose of the preempt_disable_notrace() in preempt_schedule() is to prevent function tracing from going into an infinite loop, because function tracing can trace the preempt_enable/disable() calls themselves. The problem with function tracing is: NEED_RESCHED set preempt_schedule() preempt_disable() preempt_count_inc() function trace (before incrementing preempt count) preempt_disable_notrace() preempt_enable_notrace() sees NEED_RESCHED set preempt_schedule() (repeat) Now by breaking out the preempt off/on tracing into their own code, preempt_latency_start() and preempt_latency_stop() (the helpers added below), we can add these to the preempt_schedule() code. As preemption would then be disabled, even if they were to be traced by the function tracer, the disabled preemption would prevent the recursion. Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160321112339.6dc78ad6@gandalf.local.home Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 68 +++++++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 9 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8b489fcac37bd..b5cf01d2368e3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2958,6 +2958,20 @@ u64 scheduler_tick_max_deferment(void) #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) +/* + * If the value passed in is equal to the current preempt count + * then we just disabled preemption. Start timing the latency. + */ +static inline void preempt_latency_start(int val) +{ + if (preempt_count() == val) { + unsigned long ip = get_lock_parent_ip(); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } +} void preempt_count_add(int val) { @@ -2976,17 +2990,21 @@ void preempt_count_add(int val) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) { - unsigned long ip = get_lock_parent_ip(); -#ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = ip; -#endif - trace_preempt_off(CALLER_ADDR0, ip); - } + preempt_latency_start(val); } EXPORT_SYMBOL(preempt_count_add); NOKPROBE_SYMBOL(preempt_count_add); +/* + * If the value passed in equals to the current preempt count + * then we just enabled preemption. Stop timing the latency.
+ */ +static inline void preempt_latency_stop(int val) +{ + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); +} + void preempt_count_sub(int val) { #ifdef CONFIG_DEBUG_PREEMPT @@ -3003,13 +3021,15 @@ void preempt_count_sub(int val) return; #endif - if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); + preempt_latency_stop(val); __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); NOKPROBE_SYMBOL(preempt_count_sub); +#else +static inline void preempt_latency_start(int val) { } +static inline void preempt_latency_stop(int val) { } #endif /* @@ -3284,8 +3304,23 @@ void __sched schedule_preempt_disabled(void) static void __sched notrace preempt_schedule_common(void) { do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ preempt_disable_notrace(); + preempt_latency_start(1); __schedule(true); + preempt_latency_stop(1); preempt_enable_no_resched_notrace(); /* @@ -3337,7 +3372,21 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) return; do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ preempt_disable_notrace(); + preempt_latency_start(1); /* * Needs preempt disabled in case user_exit() is traced * and the tracer calls preempt_enable_notrace() causing @@ -3347,6 +3396,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) __schedule(true); exception_exit(prev_ctx); + preempt_latency_stop(1); preempt_enable_no_resched_notrace(); } while (need_resched()); } -- GitLab From 1c3de5e19fc96206dd086e634129d08e5f7b1000 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Wed, 30 Mar 2016 07:07:51 +0800 Subject: [PATCH 048/705] sched/fair: Update comments after a variable rename The following commit: ed82b8a1ff76 ("sched/core: Move the sched_to_prio[] arrays out of line") renamed prio_to_weight to sched_prio_to_weight, but the old name was not updated in comments. 
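For context (a background note, not part of the commit): sched_prio_to_weight[] maps each nice level to a load weight such that one nice step changes CPU share by roughly 10%, i.e. approximately weight(nice) = 1024 / 1.25^nice, with weight(0) = NICE_0_LOAD = 1024. A quick stand-alone check of that rule against the table's well-known endpoints (88761 for nice -20, 15 for nice 19):

	#include <math.h>
	#include <stdio.h>

	int main(void)
	{
		/* sched_prio_to_weight[] is hand-rounded; this rule reproduces it closely */
		for (int nice = -20; nice <= 19; nice += 13)
			printf("nice %3d -> weight ~= %.0f\n",
			       nice, 1024.0 / pow(1.25, nice));
		return 0;
	}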
Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459292871-22531-1-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1fe9916b45577..4bb5ace60dc84 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -204,7 +204,7 @@ static void __update_inv_weight(struct load_weight *lw) * OR * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT * - * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case + * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case * we're guaranteed shift stays positive because inv_weight is guaranteed to * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22. * @@ -5656,7 +5656,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * W_i,0 = \Sum_j w_i,j (2) * * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight - * is derived from the nice value as per prio_to_weight[]. + * is derived from the nice value as per sched_prio_to_weight[]. * * The weight average is an exponential decay average of the instantaneous * weight: -- GitLab From 2b8c41daba327c633228169e8bd8ec067ab443f8 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Wed, 30 Mar 2016 04:30:56 +0800 Subject: [PATCH 049/705] sched/fair: Initiate a new task's util avg to a bounded value A new task's util_avg is set to full utilization of a CPU (100% time running). This accelerates a new task's utilization ramp-up, which is useful to boost its execution early on. However, it may result in (insanely) high utilization for a transient time period when a flood of tasks is spawned. Importantly, it violates the "fundamentally bounded" CPU utilization, and its side effect is negative if we don't take any measure to bound it. This patch proposes an algorithm to address this issue. It has two methods to approach a sensible initial util_avg: (1) An expected (or average) util_avg based on its cfs_rq's util_avg: util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight (2) A trajectory of how successive new tasks' util develops, which gives 1/2 of the remaining utilization budget to a new task such that the additional util is noticeably large (when overall util is low) or unnoticeably small (when overall util is high enough). Meanwhile, the aggregate utilization is well bounded: util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n where n denotes the nth task. If util_avg is larger than util_avg_cap, then the effective util is clamped to the util_avg_cap.
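To make the two rules concrete, here is a stand-alone sketch (illustrative arithmetic, not kernel code) that replays them for a stream of nice-0 forks on an initially idle cfs_rq. It assumes SCHED_LOAD_SCALE = 1024, se.load.weight = 1024, and, as a simplification, that each task contributes its full weight to cfs_rq->load_avg; it reproduces the halving series (512, 256, 128, ...) that the comment added by the patch below illustrates:

	#include <stdio.h>

	int main(void)
	{
		long cfs_util = 0, cfs_load = 0;

		for (int n = 1; n <= 7; n++) {
			long cap  = (1024 - cfs_util) / 2;		/* rule (2) */
			long util = cfs_util * 1024 / (cfs_load + 1);	/* rule (1) */

			if (cfs_util == 0 || util > cap)
				util = cap;				/* clamp to the cap */
			cfs_util += util;
			cfs_load += 1024;
			printf("task %d: util_avg %4ld, cfs_rq util_avg %4ld\n",
			       n, util, cfs_util);
		}
		return 0;
	}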
Reported-by: Andrey Ryabinin Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: steve.muckle@linaro.org Link: http://lkml.kernel.org/r/1459283456-21682-1-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ kernel/sched/fair.c | 56 ++++++++++++++++++++++++++++++++++++++++++-- kernel/sched/sched.h | 1 + 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b5cf01d2368e3..11594230ef4de 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2431,6 +2431,8 @@ void wake_up_new_task(struct task_struct *p) */ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif + /* Post initialize new task's util average when its cfs_rq is set */ + post_init_entity_util_avg(&p->se); rq = __task_rq_lock(p); activate_task(rq, p, 0); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4bb5ace60dc84..b8cc1c35cd7c1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -682,17 +682,68 @@ void init_entity_runnable_average(struct sched_entity *se) sa->period_contrib = 1023; sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); - sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + /* + * At this point, util_avg won't be used in select_task_rq_fair anyway + */ + sa->util_avg = 0; + sa->util_sum = 0; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +/* + * With new tasks being created, their initial util_avgs are extrapolated + * based on the cfs_rq's current util_avg: + * + * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * + * However, in many cases, the above util_avg does not give a desired + * value. Moreover, the sum of the util_avgs may be divergent, such + * as when the series is a harmonic series. + * + * To solve this problem, we also cap the util_avg of successive tasks to + * only 1/2 of the left utilization budget: + * + * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n + * + * where n denotes the nth task. + * + * For example, a simplest series from the beginning would be like: + * + * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... + * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... + * + * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) + * if util_avg > util_avg_cap. 
+ */ +void post_init_entity_util_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct sched_avg *sa = &se->avg; + long cap = (long)(scale_load_down(SCHED_LOAD_SCALE) - cfs_rq->avg.util_avg) / 2; + + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + } +} + static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) { } +void post_init_entity_util_avg(struct sched_entity *se) +{ +} #endif /* @@ -8384,6 +8435,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); + post_init_entity_util_avg(se); } return 1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ec2e8d23527e6..a7cbad7b3ad28 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1313,6 +1313,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); +extern void post_init_entity_util_avg(struct sched_entity *se); #ifdef CONFIG_NO_HZ_FULL extern bool sched_can_stop_tick(struct rq *rq); -- GitLab From 568a58e5dfbcb88011cad7f87ed046aa00f19d1a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 29 Mar 2016 17:42:01 +0200 Subject: [PATCH 050/705] x86/mm/pat, x86/cpufeature: Remove cpu_has_pat Signed-off-by: Borislav Petkov Acked-by: Daniel Vetter Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: intel-gfx@lists.freedesktop.org Link: http://lkml.kernel.org/r/1459266123-21878-9-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 - drivers/gpu/drm/i915/i915_gem.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3636ec06c8875..7a3fa7d70bd77 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -132,7 +132,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLUSH) #define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES) #define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) -#define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 3d31d3ac589e8..aaec8aef9fd4c 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1732,7 +1732,7 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data, if (args->flags & ~(I915_MMAP_WC)) return -EINVAL; - if (args->flags & I915_MMAP_WC && !cpu_has_pat) + if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT)) return -ENODEV; obj = drm_gem_object_lookup(dev, file, args->handle); -- GitLab From 7b5e74e637e4a977c7cf40fd7de332f60b68180e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 29 Mar 2016 17:41:54 +0200 Subject: [PATCH 051/705] x86/cpufeature: Remove 
cpu_has_arch_perfmon Use boot_cpu_has() instead. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: oprofile-list@lists.sf.net Link: http://lkml.kernel.org/r/1459266123-21878-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 - arch/x86/oprofile/nmi_int.c | 4 ++-- arch/x86/oprofile/op_model_ppro.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3636ec06c8875..fee7a6efcd2da 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -131,7 +131,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) #define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLUSH) #define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES) -#define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) #define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 0e07e0968c3a0..25171e9595f74 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -636,7 +636,7 @@ static int __init ppro_init(char **cpu_type) __u8 cpu_model = boot_cpu_data.x86_model; struct op_x86_model_spec *spec = &op_ppro_spec; /* default */ - if (force_cpu_type == arch_perfmon && cpu_has_arch_perfmon) + if (force_cpu_type == arch_perfmon && boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) return 0; /* @@ -761,7 +761,7 @@ int __init op_nmi_init(struct oprofile_operations *ops) if (cpu_type) break; - if (!cpu_has_arch_perfmon) + if (!boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) return -ENODEV; /* use arch perfmon as fallback */ diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index d90528ea54120..350f7096baac8 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -75,7 +75,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, u64 val; int i; - if (cpu_has_arch_perfmon) { + if (boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) { union cpuid10_eax eax; eax.full = cpuid_eax(0xa); -- GitLab From 0c9f3536cc712dfd5ec3127d55cd7b807cc0adb5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 29 Mar 2016 17:41:55 +0200 Subject: [PATCH 052/705] x86/cpufeature: Remove cpu_has_hypervisor Use boot_cpu_has() instead. 
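A note on the pattern running through this part of the series (a summary, not from the changelogs): each patch deletes one ad-hoc cpu_has_* wrapper macro from cpufeature.h and open-codes the test at the call sites, choosing between the plain runtime check and the self-patching one. Illustratively (X86_FEATURE_XYZ stands in for any feature bit; setup_xyz() and fast_path_xyz() are placeholders):

	/* Before: one wrapper macro per feature, defined centrally */
	#define cpu_has_xyz	boot_cpu_has(X86_FEATURE_XYZ)

	/* After: open-coded at the call site ... */
	if (boot_cpu_has(X86_FEATURE_XYZ))	/* ordinary runtime bit test */
		setup_xyz();

	/* ... or, on hot paths, the variant whose branch is patched
	 * in at boot via the alternatives mechanism: */
	if (static_cpu_has(X86_FEATURE_XYZ))
		fast_path_xyz();

Later patches in the series rely on that distinction explicitly, e.g. picking static_cpu_has() for __flush_tlb_all() and the DRM cache-flush helpers because those are time-sensitive.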
Tested-by: David Kershner Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: sparmaintainer@unisys.com Cc: virtualization@lists.linux-foundation.org Link: http://lkml.kernel.org/r/1459266123-21878-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.c | 2 +- arch/x86/include/asm/cpufeature.h | 1 - arch/x86/kernel/cpu/vmware.c | 2 +- arch/x86/kernel/kvm.c | 2 +- drivers/staging/unisys/visorbus/visorchipset.c | 2 +- 5 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 7012d18bb2930..3f6d8b5672d5c 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1383,7 +1383,7 @@ static int __init intel_uncore_init(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return -ENODEV; - if (cpu_has_hypervisor) + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return -ENODEV; max_packages = topology_max_packages(); diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index fee7a6efcd2da..3aea54ecabfda 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -136,7 +136,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) #define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) -#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) /* * Do not add any more of those clumsy macros - use static_cpu_has() for * fast paths and boot_cpu_has() otherwise! diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 364e583468975..8cac429b6a1d5 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -94,7 +94,7 @@ static void __init vmware_platform_setup(void) */ static uint32_t __init vmware_platform(void) { - if (cpu_has_hypervisor) { + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { unsigned int eax; unsigned int hyper_vendor_id[3]; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 807950860fb70..dc1207e2f1939 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -522,7 +522,7 @@ static noinline uint32_t __kvm_cpuid_base(void) if (boot_cpu_data.cpuid_level < 0) return 0; /* So we don't blow up on old processors */ - if (cpu_has_hypervisor) + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); return 0; diff --git a/drivers/staging/unisys/visorbus/visorchipset.c b/drivers/staging/unisys/visorbus/visorchipset.c index 5fbda7b218c7a..9cf4f8463c4e6 100644 --- a/drivers/staging/unisys/visorbus/visorchipset.c +++ b/drivers/staging/unisys/visorbus/visorchipset.c @@ -2425,7 +2425,7 @@ static __init uint32_t visorutil_spar_detect(void) { unsigned int eax, ebx, ecx, edx; - if (cpu_has_hypervisor) { + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { /* check the ID */ cpuid(UNISYS_SPAR_LEAF_ID, &eax, &ebx, &ecx, &edx); return (ebx == UNISYS_SPAR_ID_EBX) && -- GitLab From ab4a56fa2c6ce9384ca077b6570c56fe18361f17 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 29 Mar 2016 17:41:56 +0200 Subject: [PATCH 053/705] x86/cpufeature: Remove cpu_has_osxsave Use boot_cpu_has() instead. 
Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-crypto@vger.kernel.org Link: http://lkml.kernel.org/r/1459266123-21878-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/crypto/camellia_aesni_avx2_glue.c | 3 ++- arch/x86/crypto/camellia_aesni_avx_glue.c | 2 +- arch/x86/crypto/serpent_avx2_glue.c | 2 +- arch/x86/include/asm/cpufeature.h | 1 - arch/x86/include/asm/xor_avx.h | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index d844569245633..c37f7028c85ae 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -562,7 +562,8 @@ static int __init camellia_aesni_init(void) { const char *feature_name; - if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { + if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || + !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX2 or AES-NI instructions are not detected.\n"); return -ENODEV; } diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 93d8f295784e3..65f64556725b2 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -554,7 +554,7 @@ static int __init camellia_aesni_init(void) { const char *feature_name; - if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { + if (!cpu_has_avx || !cpu_has_aes || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX or AES-NI instructions are not detected.\n"); return -ENODEV; } diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 6d198342e2de4..408cae2b35438 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -538,7 +538,7 @@ static int __init init(void) { const char *feature_name; - if (!cpu_has_avx2 || !cpu_has_osxsave) { + if (!cpu_has_avx2 || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX2 instructions are not detected.\n"); return -ENODEV; } diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3aea54ecabfda..33c29aabc9aaf 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -135,7 +135,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) -#define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) /* * Do not add any more of those clumsy macros - use static_cpu_has() for * fast paths and boot_cpu_has() otherwise! diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h index 7c0a517ec7511..e45e556140af9 100644 --- a/arch/x86/include/asm/xor_avx.h +++ b/arch/x86/include/asm/xor_avx.h @@ -167,12 +167,12 @@ static struct xor_block_template xor_block_avx = { #define AVX_XOR_SPEED \ do { \ - if (cpu_has_avx && cpu_has_osxsave) \ + if (cpu_has_avx && boot_cpu_has(X86_FEATURE_OSXSAVE)) \ xor_speed(&xor_block_avx); \ } while (0) #define AVX_SELECT(FASTEST) \ - (cpu_has_avx && cpu_has_osxsave ? &xor_block_avx : FASTEST) + (cpu_has_avx && boot_cpu_has(X86_FEATURE_OSXSAVE) ? 
&xor_block_avx : FASTEST) #else -- GitLab From 62436a4d36c94d202784cd8a997ff8bb4b880237 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 29 Mar 2016 17:41:57 +0200 Subject: [PATCH 054/705] x86/cpufeature: Remove cpu_has_x2apic Signed-off-by: Borislav Petkov Acked-by: Tony Luck Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459266123-21878-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/iommu.h | 1 - arch/x86/include/asm/apic.h | 4 ++-- arch/x86/include/asm/cpufeature.h | 1 - arch/x86/kernel/apic/apic.c | 2 +- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/ia64/include/asm/iommu.h b/arch/ia64/include/asm/iommu.h index 105c93b00b1bc..1d1212901ae70 100644 --- a/arch/ia64/include/asm/iommu.h +++ b/arch/ia64/include/asm/iommu.h @@ -1,7 +1,6 @@ #ifndef _ASM_IA64_IOMMU_H #define _ASM_IA64_IOMMU_H 1 -#define cpu_has_x2apic 0 /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT (((cycles_t) local_cpu_data->itc_freq)*10) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 98f25bbafac4c..bc27611fa58f1 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -239,10 +239,10 @@ extern void __init check_x2apic(void); extern void x2apic_setup(void); static inline int x2apic_enabled(void) { - return cpu_has_x2apic && apic_is_x2apic_enabled(); + return boot_cpu_has(X86_FEATURE_X2APIC) && apic_is_x2apic_enabled(); } -#define x2apic_supported() (cpu_has_x2apic) +#define x2apic_supported() (boot_cpu_has(X86_FEATURE_X2APIC)) #else /* !CONFIG_X86_X2APIC */ static inline void check_x2apic(void) { } static inline void x2apic_setup(void) { } diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 33c29aabc9aaf..3da7aec9fca13 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -132,7 +132,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLUSH) #define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES) #define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) -#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) /* diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d356987a04e97..d7867c885bf8a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1561,7 +1561,7 @@ void __init check_x2apic(void) pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n"); x2apic_mode = 1; x2apic_state = X2APIC_ON; - } else if (!cpu_has_x2apic) { + } else if (!boot_cpu_has(X86_FEATURE_X2APIC)) { x2apic_state = X2APIC_DISABLED; } } -- GitLab From b8291adc191abec2095f03a130ac91506d345cae Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 29 Mar 2016 17:41:58 +0200 Subject: [PATCH 055/705] x86/cpufeature: Remove cpu_has_gbpages Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459266123-21878-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 - arch/x86/kvm/mmu.c | 3 ++- arch/x86/mm/hugetlbpage.c | 4 ++-- arch/x86/mm/init.c | 2 +- arch/x86/mm/ioremap.c | 2 +- arch/x86/mm/pageattr.c | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3da7aec9fca13..693b4aa439085 100644 --- 
a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -130,7 +130,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) #define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) #define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLUSH) -#define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES) #define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 70e95d097ef10..bc1e0b65909e6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3836,7 +3836,8 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, boot_cpu_data.x86_phys_bits, context->shadow_root_level, false, - cpu_has_gbpages, true, true); + boot_cpu_has(X86_FEATURE_GBPAGES), + true, true); else __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, boot_cpu_data.x86_phys_bits, diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 740d7ac03a552..14a95054d4e05 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -162,7 +162,7 @@ static __init int setup_hugepagesz(char *opt) unsigned long ps = memparse(opt, &opt); if (ps == PMD_SIZE) { hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - } else if (ps == PUD_SIZE && cpu_has_gbpages) { + } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); } else { printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", @@ -177,7 +177,7 @@ __setup("hugepagesz=", setup_hugepagesz); static __init int gigantic_pages_init(void) { /* With compaction or CMA we can allocate gigantic pages at runtime */ - if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT)) + if (boot_cpu_has(X86_FEATURE_GBPAGES) && !size_to_hstate(1UL << PUD_SHIFT)) hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); return 0; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 9d56f271d5195..14377e98f2798 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -173,7 +173,7 @@ static void __init probe_page_size_mask(void) __supported_pte_mask &= ~_PAGE_GLOBAL; /* Enable 1 GB linear kernel mappings if available: */ - if (direct_gbpages && cpu_has_gbpages) { + if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { printk(KERN_INFO "Using GB pages for direct mapping\n"); page_size_mask |= 1 << PG_LEVEL_1G; } else { diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0d8d53d1f5cc2..5a116ace9cbbe 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -378,7 +378,7 @@ EXPORT_SYMBOL(iounmap); int __init arch_ioremap_pud_supported(void) { #ifdef CONFIG_X86_64 - return cpu_has_gbpages; + return boot_cpu_has(X86_FEATURE_GBPAGES); #else return 0; #endif diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 01be9ec3bf792..fb20c2ee00922 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1055,7 +1055,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, /* * Map everything starting from the Gb boundary, possibly with 1G pages */ - while (cpu_has_gbpages && end - start >= PUD_SIZE) { + while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) { set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | massage_pgprot(pud_pgprot))); -- GitLab From 906bf7fda2c9cf5c1762ec607943ed54b6c5b203 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 29 Mar 2016 
17:41:59 +0200 Subject: [PATCH 056/705] x86/cpufeature: Remove cpu_has_clflush Use the fast variant in the DRM code. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Cc: intel-gfx@lists.freedesktop.org Link: http://lkml.kernel.org/r/1459266123-21878-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 - arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/tce_64.c | 2 +- arch/x86/mm/pageattr.c | 2 +- drivers/gpu/drm/drm_cache.c | 6 +++--- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 2 +- 6 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 693b4aa439085..a75154232db52 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -129,7 +129,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) #define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) #define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) -#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLUSH) #define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 1f7fdb91a818b..628a9f853b84c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -468,7 +468,7 @@ static void init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_PEBS); } - if (c->x86 == 6 && cpu_has_clflush && + if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_CLFLUSH) && (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47)) set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR); diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c index ab40954e113e9..f386bad0984ed 100644 --- a/arch/x86/kernel/tce_64.c +++ b/arch/x86/kernel/tce_64.c @@ -40,7 +40,7 @@ static inline void flush_tce(void* tceaddr) { /* a single tce can't cross a cache line */ - if (cpu_has_clflush) + if (boot_cpu_has(X86_FEATURE_CLFLUSH)) clflush(tceaddr); else wbinvd(); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index fb20c2ee00922..bbf462ff9745c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1460,7 +1460,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, * error case we fall back to cpa_flush_all (which uses * WBINVD): */ - if (!ret && cpu_has_clflush) { + if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) { if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { cpa_flush_array(addr, numpages, cache, cpa.flags, pages); diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c index 6743ff7dccfa3..059f7c39c5828 100644 --- a/drivers/gpu/drm/drm_cache.c +++ b/drivers/gpu/drm/drm_cache.c @@ -72,7 +72,7 @@ drm_clflush_pages(struct page *pages[], unsigned long num_pages) { #if defined(CONFIG_X86) - if (cpu_has_clflush) { + if (static_cpu_has(X86_FEATURE_CLFLUSH)) { drm_cache_flush_clflush(pages, num_pages); return; } @@ -105,7 +105,7 @@ void drm_clflush_sg(struct sg_table *st) { #if defined(CONFIG_X86) - if (cpu_has_clflush) { + if (static_cpu_has(X86_FEATURE_CLFLUSH)) { struct sg_page_iter sg_iter; mb(); @@ -129,7 +129,7 @@ void drm_clflush_virt_range(void *addr, unsigned long length) { #if defined(CONFIG_X86) - if (cpu_has_clflush) { + if (static_cpu_has(X86_FEATURE_CLFLUSH)) { const int size = boot_cpu_data.x86_clflush_size; void *end = addr + length; addr = 
(void *)(((unsigned long)addr) & -size); diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 1328bc5021b4c..b845f468dd74f 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -488,7 +488,7 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj, ret = relocate_entry_cpu(obj, reloc, target_offset); else if (obj->map_and_fenceable) ret = relocate_entry_gtt(obj, reloc, target_offset); - else if (cpu_has_clflush) + else if (static_cpu_has(X86_FEATURE_CLFLUSH)) ret = relocate_entry_clflush(obj, reloc, target_offset); else { WARN_ONCE(1, "Impossible case in relocation handling\n"); -- GitLab From 054efb6467f84490bdf92afab6d9dbd5102e620a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 29 Mar 2016 17:42:00 +0200 Subject: [PATCH 057/705] x86/cpufeature: Remove cpu_has_xmm2 Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-crypto@vger.kernel.org Link: http://lkml.kernel.org/r/1459266123-21878-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/crypto/poly1305_glue.c | 2 +- arch/x86/crypto/serpent_sse2_glue.c | 2 +- arch/x86/include/asm/cpufeature.h | 1 - arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/lib/usercopy_32.c | 4 ++-- 6 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 4264a3d595894..b283868acdf85 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -179,7 +179,7 @@ static struct shash_alg alg = { static int __init poly1305_simd_mod_init(void) { - if (!cpu_has_xmm2) + if (!boot_cpu_has(X86_FEATURE_XMM2)) return -ENODEV; #ifdef CONFIG_AS_AVX2 diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 8943407e8917a..644f97ab8cace 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -600,7 +600,7 @@ static struct crypto_alg serpent_algs[10] = { { static int __init serpent_sse2_init(void) { - if (!cpu_has_xmm2) { + if (!boot_cpu_has(X86_FEATURE_XMM2)) { printk(KERN_INFO "SSE2 instructions are not detected.\n"); return -ENODEV; } diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index a75154232db52..5e02bc2e8444f 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -125,7 +125,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) #define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR) #define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) -#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2) #define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) #define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) #define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 6e47e3a916f12..ea8f88a2a688b 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -750,7 +750,7 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); - if (cpu_has_xmm2) { + if (cpu_has(c, X86_FEATURE_XMM2)) { /* MFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 628a9f853b84c..1dba36fe73e5d 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -456,7 +456,7 @@ static 
void init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } - if (cpu_has_xmm2) + if (cpu_has(c, X86_FEATURE_XMM2)) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); if (boot_cpu_has(X86_FEATURE_DS)) { diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 91d93b95bd868..b559d92387813 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -612,7 +612,7 @@ unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, { stac(); #ifdef CONFIG_X86_INTEL_USERCOPY - if (n > 64 && cpu_has_xmm2) + if (n > 64 && static_cpu_has(X86_FEATURE_XMM2)) n = __copy_user_zeroing_intel_nocache(to, from, n); else __copy_user_zeroing(to, from, n); @@ -629,7 +629,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr { stac(); #ifdef CONFIG_X86_INTEL_USERCOPY - if (n > 64 && cpu_has_xmm2) + if (n > 64 && static_cpu_has(X86_FEATURE_XMM2)) n = __copy_user_intel_nocache(to, from, n); else __copy_user(to, from, n); -- GitLab From c109bf95992b391bb40bc37c5d309d13fead99b5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 29 Mar 2016 17:42:02 +0200 Subject: [PATCH 058/705] x86/cpufeature: Remove cpu_has_pge Use static_cpu_has() in __flush_tlb_all() due to the time-sensitivity of this one. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459266123-21878-10-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 - arch/x86/include/asm/tlbflush.h | 2 +- arch/x86/kernel/cpu/intel.c | 6 +++--- arch/x86/kernel/cpu/mtrr/cyrix.c | 4 ++-- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/mm/init.c | 2 +- arch/x86/xen/enlighten.c | 2 +- drivers/lguest/x86/core.c | 2 +- 8 files changed, 11 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 5e02bc2e8444f..f97b53417d44c 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -121,7 +121,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) #define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE) #define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC) -#define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE) #define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) #define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR) #define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index c24b4224d4392..3628e6c5ebf4c 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -181,7 +181,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) static inline void __flush_tlb_all(void) { - if (cpu_has_pge) + if (static_cpu_has(X86_FEATURE_PGE)) __flush_tlb_global(); else __flush_tlb(); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 1dba36fe73e5d..f71a34944b560 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -152,9 +152,9 @@ static void early_init_intel(struct cpuinfo_x86 *c) * the TLB when any changes are made to any of the page table entries. 
* The operating system must reload CR3 to cause the TLB to be flushed" * - * As a result cpu_has_pge() in arch/x86/include/asm/tlbflush.h should - * be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE - * to be modified + * As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h + * should be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE + * to be modified. */ if (c->x86 == 5 && c->x86_model == 9) { pr_info("Disabling PGE capability bit\n"); diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index f8c81ba0b4651..b1086f79e57e4 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -137,7 +137,7 @@ static void prepare_set(void) u32 cr0; /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if (cpu_has_pge) { + if (boot_cpu_has(X86_FEATURE_PGE)) { cr4 = __read_cr4(); __write_cr4(cr4 & ~X86_CR4_PGE); } @@ -170,7 +170,7 @@ static void post_set(void) write_cr0(read_cr0() & ~X86_CR0_CD); /* Restore value of CR4 */ - if (cpu_has_pge) + if (boot_cpu_has(X86_FEATURE_PGE)) __write_cr4(cr4); } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 19f57360dfd25..f1bed301bdb27 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -741,7 +741,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) wbinvd(); /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if (cpu_has_pge) { + if (boot_cpu_has(X86_FEATURE_PGE)) { cr4 = __read_cr4(); __write_cr4(cr4 & ~X86_CR4_PGE); } @@ -771,7 +771,7 @@ static void post_set(void) __releases(set_atomicity_lock) write_cr0(read_cr0() & ~X86_CR0_CD); /* Restore value of CR4 */ - if (cpu_has_pge) + if (boot_cpu_has(X86_FEATURE_PGE)) __write_cr4(cr4); raw_spin_unlock(&set_atomicity_lock); } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 14377e98f2798..05ff46a9c2619 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -166,7 +166,7 @@ static void __init probe_page_size_mask(void) cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ - if (cpu_has_pge) { + if (boot_cpu_has(X86_FEATURE_PGE)) { cr4_set_bits_and_update_boot(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } else diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 880862c7d9ddb..055f48ddb03ca 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1472,7 +1472,7 @@ static void xen_pvh_set_cr_flags(int cpu) if (cpu_has_pse) cr4_set_bits_and_update_boot(X86_CR4_PSE); - if (cpu_has_pge) + if (boot_cpu_has(X86_FEATURE_PGE)) cr4_set_bits_and_update_boot(X86_CR4_PGE); } diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 6a4cd771a2be6..65f22debf3c65 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -599,7 +599,7 @@ void __init lguest_arch_host_init(void) * doing this. */ get_online_cpus(); - if (cpu_has_pge) { /* We have a broader idea of "global". */ + if (boot_cpu_has(X86_FEATURE_PGE)) { /* We have a broader idea of "global". */ /* Remember that this was originally set (for cleanup). 
*/ cpu_had_pge = 1; /* -- GitLab From 16bf92261b1b6cb1a1c0671b445a2fcb5a1ecc96 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 29 Mar 2016 17:42:03 +0200 Subject: [PATCH 059/705] x86/cpufeature: Remove cpu_has_pse Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459266123-21878-11-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 - arch/x86/include/asm/pgtable.h | 2 +- arch/x86/mm/init.c | 4 ++-- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 4 ++-- arch/x86/mm/ioremap.c | 2 +- arch/x86/power/hibernate_32.c | 2 +- arch/x86/xen/enlighten.c | 2 +- 8 files changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index f97b53417d44c..97e5f13ea4713 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -119,7 +119,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; } while (0) #define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) -#define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE) #define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC) #define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) #define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 97f3242e133cc..f86491a7bc9dd 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -183,7 +183,7 @@ static inline int pmd_trans_huge(pmd_t pmd) static inline int has_transparent_hugepage(void) { - return cpu_has_pse; + return boot_cpu_has(X86_FEATURE_PSE); } #ifdef __HAVE_ARCH_PTE_DEVMAP diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 05ff46a9c2619..372aad2b32910 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -157,12 +157,12 @@ static void __init probe_page_size_mask(void) * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. 
*/ - if (cpu_has_pse && !debug_pagealloc_enabled()) + if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled()) page_size_mask |= 1 << PG_LEVEL_2M; #endif /* Enable PSE if available */ - if (cpu_has_pse) + if (boot_cpu_has(X86_FEATURE_PSE)) cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bd7a9b9e2e14a..85af914e3d275 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -284,7 +284,7 @@ kernel_physical_mapping_init(unsigned long start, */ mapping_iter = 1; - if (!cpu_has_pse) + if (!boot_cpu_has(X86_FEATURE_PSE)) use_pse = 0; repeat: diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 214afda979114..89d97477c1d92 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1295,7 +1295,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) struct vmem_altmap *altmap = to_vmem_altmap(start); int err; - if (cpu_has_pse) + if (boot_cpu_has(X86_FEATURE_PSE)) err = vmemmap_populate_hugepages(start, end, node, altmap); else if (altmap) { pr_err_once("%s: no cpu support for altmap allocations\n", @@ -1338,7 +1338,7 @@ void register_page_bootmem_memmap(unsigned long section_nr, } get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); - if (!cpu_has_pse) { + if (!boot_cpu_has(X86_FEATURE_PSE)) { next = (addr + PAGE_SIZE) & PAGE_MASK; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 5a116ace9cbbe..f0894910bdd73 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -386,7 +386,7 @@ int __init arch_ioremap_pud_supported(void) int __init arch_ioremap_pmd_supported(void) { - return cpu_has_pse; + return boot_cpu_has(X86_FEATURE_PSE); } /* diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 291226b952a99..9f14bd34581d6 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -106,7 +106,7 @@ static int resume_physical_mapping_init(pgd_t *pgd_base) * normal page tables. * NOTE: We can mark everything as executable here */ - if (cpu_has_pse) { + if (boot_cpu_has(X86_FEATURE_PSE)) { set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); pfn += PTRS_PER_PTE; } else { diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 055f48ddb03ca..ff2a2e6ef7af9 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1469,7 +1469,7 @@ static void xen_pvh_set_cr_flags(int cpu) * For BSP, PSE PGE are set in probe_page_size_mask(), for APs * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__init_cpu(). */ - if (cpu_has_pse) + if (boot_cpu_has(X86_FEATURE_PSE)) cr4_set_bits_and_update_boot(X86_CR4_PSE); if (boot_cpu_has(X86_FEATURE_PGE)) -- GitLab From 8fad7ec51e1b9e262e0bdd34e800ac1ea5e84dec Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Sat, 26 Mar 2016 21:40:16 +0100 Subject: [PATCH 060/705] x86/dumpstack: Combine some printk()s Long ago, Jiri Slaby noted that the subsequent printk()s should be pr_cont(). Let's instead get rid of the multiple printk calls. 
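The transformation generalizes beyond the kernel. Below is a minimal user-space sketch of the same idea, with hypothetical CONFIG_* constants standing in for the kernel's IS_ENABLED() tests; it collapses what would otherwise be several conditional print calls into one atomic line, so output from other CPUs cannot interleave with the oops banner:

/* Minimal user-space sketch of the pattern this patch applies.  The
 * CONFIG_* macros below are made-up build-time switches, not values
 * taken from the patch. */
#include <stdio.h>

#define CONFIG_PREEMPT 1
#define CONFIG_SMP     1
#define CONFIG_KASAN   0

int main(void)
{
	int die_counter = 0;
	unsigned long err = 0x000e;

	/* One printf(), one line of output -- nothing can interleave. */
	printf("Oops: %04lx [#%d]%s%s%s\n", err & 0xffff, ++die_counter,
	       CONFIG_PREEMPT ? " PREEMPT" : "",
	       CONFIG_SMP ? " SMP" : "",
	       CONFIG_KASAN ? " KASAN" : "");
	return 0;
}

This prints "Oops: 000e [#1] PREEMPT SMP", matching what the chain of printk()s used to build piecewise.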
Signed-off-by: Rasmus Villemoes Cc: Jiri Slaby Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459024817-27122-1-git-send-email-linux@rasmusvillemoes.dk Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 8efa57a5f29ea..2bb25c3fe2e8e 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -260,19 +260,12 @@ int __die(const char *str, struct pt_regs *regs, long err) unsigned long sp; #endif printk(KERN_DEFAULT - "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif - if (debug_pagealloc_enabled()) - printk("DEBUG_PAGEALLOC "); -#ifdef CONFIG_KASAN - printk("KASAN"); -#endif - printk("\n"); + "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter, + IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", + IS_ENABLED(CONFIG_SMP) ? " SMP" : "", + debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", + IS_ENABLED(CONFIG_KASAN) ? " KASAN" : ""); + if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) return 1; -- GitLab From 3ed5ca2efff70e9f589087c2013789572901112d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 30 Mar 2016 16:51:17 -0300 Subject: [PATCH 061/705] perf trace: Do not process PERF_RECORD_LOST twice We catch this record to provide a visual indication that events are getting lost, then call the default method to allow extra logging shared with the other tools to take place. This extra logging was done twice because we were continuing to the "default" clause, where machine__process_event() would end up calling machine__process_lost_event() again. Fix it by breaking out of the switch instead of falling through.
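The bug class is easy to demonstrate in isolation. The sketch below is plain C with hypothetical handler names standing in for the machine__process_*() calls; it shows how a case that lacks a break falls through into the default clause and handles the same event twice, and how the one-line break restores the intended behavior:

/* Stand-alone illustration of the fallthrough bug fixed here.
 * handle_lost()/handle_default() are hypothetical stand-ins. */
#include <stdio.h>

enum ev { EV_LOST, EV_OTHER };

static void handle_lost(void)    { puts("process lost event"); }
static void handle_default(void) { puts("process generic event"); }

static void dispatch(enum ev e, int fixed)
{
	switch (e) {
	case EV_LOST:
		handle_lost();
		if (fixed)
			break;	/* the one-line fix: stop here */
		/* without the break, control falls through... */
	default:
		handle_default();	/* ...and the event is handled again */
		break;
	}
}

int main(void)
{
	puts("-- buggy --");
	dispatch(EV_LOST, 0);	/* prints both lines: double processing */
	puts("-- fixed --");
	dispatch(EV_LOST, 1);	/* prints only the lost-event line */
	return 0;
}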
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-wus2zlhw3qo24ye84ewu4aqw@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 93ac724fb635c..6485576f3337c 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1618,6 +1618,7 @@ static int trace__process_event(struct trace *trace, struct machine *machine, color_fprintf(trace->output, PERF_COLOR_RED, "LOST %" PRIu64 " events!\n", event->lost.lost); ret = machine__process_lost_event(machine, event, sample); + break; default: ret = machine__process_event(machine, event, sample); break; -- GitLab From 997bba8cf1875d9715e792c445e1a9c7a4c365e2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 30 Mar 2016 19:43:32 -0300 Subject: [PATCH 062/705] perf trace: Pretty print seccomp() args E.g: # trace -e seccomp 200.061 (0.009 ms): :2441/2441 seccomp(op: FILTER, flags: TSYNC ) = -1 EFAULT Bad address 200.910 (0.121 ms): :2441/2441 seccomp(op: FILTER, flags: TSYNC, uargs: 0x7fff57479fe0) = 0 Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-t369uckshlwp4evkks4bcoo7@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 47 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 6485576f3337c..0c8bcb94934e0 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -40,6 +40,10 @@ #include #include #include +#include +#include +#include +#include /* For older distros: */ #ifndef MAP_STACK @@ -1001,6 +1005,46 @@ static const char *tioctls[] = { static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401); #endif /* defined(__i386__) || defined(__x86_64__) */ +static size_t syscall_arg__scnprintf_seccomp_op(char *bf, size_t size, struct syscall_arg *arg) +{ + int op = arg->val; + size_t printed = 0; + + switch (op) { +#define P_SECCOMP_SET_MODE_OP(n) case SECCOMP_SET_MODE_##n: printed = scnprintf(bf, size, #n); break + P_SECCOMP_SET_MODE_OP(STRICT); + P_SECCOMP_SET_MODE_OP(FILTER); +#undef P_SECCOMP_SET_MODE_OP + default: printed = scnprintf(bf, size, "%#x", op); break; + } + + return printed; +} + +#define SCA_SECCOMP_OP syscall_arg__scnprintf_seccomp_op + +static size_t syscall_arg__scnprintf_seccomp_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + +#define P_FLAG(n) \ + if (flags & SECCOMP_FILTER_FLAG_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~SECCOMP_FILTER_FLAG_##n; \ + } + + P_FLAG(TSYNC); +#undef P_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? 
"|" : "", flags); + + return printed; +} + +#define SCA_SECCOMP_FLAGS syscall_arg__scnprintf_seccomp_flags + #define STRARRAY(arg, name, array) \ .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \ .arg_parm = { [arg] = &strarray__##array, } @@ -1234,6 +1278,9 @@ static struct syscall_fmt { .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, }, { .name = "rt_tgsigqueueinfo", .errmsg = true, .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, }, + { .name = "seccomp", .errmsg = true, + .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */ + [1] = SCA_SECCOMP_FLAGS, /* flags */ }, }, { .name = "select", .errmsg = true, .timeout = true, }, { .name = "sendmmsg", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ -- GitLab From 39878d492c049796202b70dc0ef14449cafa3cb4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 30 Mar 2016 20:02:15 -0300 Subject: [PATCH 063/705] perf trace: Pretty print getrandom() args # trace -e getrandom 35622.560 ( 0.023 ms): systemd-udevd/631 getrandom(buf: 0x55621e3c18f0, count: 16, flags: NONBLOCK) = 16 35622.585 ( 0.006 ms): systemd-udevd/631 getrandom(buf: 0x55621e3c18f0, count: 16, flags: NONBLOCK) = 16 35622.594 ( 0.004 ms): systemd-udevd/631 getrandom(buf: 0x55621e3c18f0, count: 16, flags: NONBLOCK) = 16 35627.395 ( 0.010 ms): libvirtd/1353 getrandom(buf: 0x7f7a1bfa35c0, count: 16, flags: NONBLOCK ) = 16 35630.940 ( 0.013 ms): fwupd/16120 getrandom(buf: 0x7f63243aa5c0, count: 16, flags: NONBLOCK ) = 16 35718.613 ( 0.015 ms): systemd-udevd/631 getrandom(buf: 0x55621e3c18f0, count: 16, flags: NONBLOCK) = 16 35718.629 ( 0.005 ms): systemd-udevd/631 getrandom(buf: 0x55621e3c18f0, count: 16, flags: NONBLOCK) = 16 35718.637 ( 0.004 ms): systemd-udevd/631 getrandom(buf: 0x55621e3c18f0, count: 16, flags: NONBLOCK) = 16 35719.355 ( 0.010 ms): libvirtd/1353 getrandom(buf: 0x7f7a1bfa35c0, count: 16, flags: NONBLOCK ) = 16 35721.042 ( 0.030 ms): fwupd/16120 getrandom(buf: 0x7f63243aa5c0, count: 16, flags: NONBLOCK ) = 16 41090.830 ( 0.012 ms): systemd-udevd/631 getrandom(buf: 0x55621e3c18f0, count: 16, flags: NONBLOCK) = 16 41090.845 ( 0.004 ms): systemd-udevd/631 getrandom(buf: 0x55621e3c18f0, count: 16, flags: NONBLOCK) = 16 41090.851 ( 0.004 ms): systemd-udevd/631 getrandom(buf: 0x55621e3c18f0, count: 16, flags: NONBLOCK) = 16 41091.750 ( 0.010 ms): libvirtd/1353 getrandom(buf: 0x7f7a1bfa35c0, count: 16, flags: NONBLOCK ) = 16 41091.823 ( 0.006 ms): fwupd/16120 getrandom(buf: 0x7f63243aa5c0, count: 16, flags: NONBLOCK ) = 16 41122.078 ( 0.053 ms): systemd-udevd/631 getrandom(buf: 0x55621e3c18f0, count: 16, flags: NONBLOCK) = 16 41122.129 ( 0.009 ms): systemd-udevd/631 getrandom(buf: 0x55621e3c18f0, count: 16, flags: NONBLOCK) = 16 41122.139 ( 0.004 ms): systemd-udevd/631 getrandom(buf: 0x55621e3c18f0, count: 16, flags: NONBLOCK) = 16 41124.492 ( 0.007 ms): libvirtd/1353 getrandom(buf: 0x7f7a1bfa35c0, count: 16, flags: NONBLOCK ) = 16 41124.470 ( 0.013 ms): fwupd/16120 getrandom(buf: 0x7f63243aa5c0, count: 16, flags: NONBLOCK ) = 16 41590.832 ( 0.014 ms): chrome/5957 getrandom(buf: 0x7fabac7b15b0, count: 16, flags: NONBLOCK ) = 16 41590.884 ( 0.004 ms): chrome/5957 getrandom(buf: 0x7fabac7b15c0, count: 16, flags: NONBLOCK ) = 16 Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-gca0n1p3aca3depey703ph2q@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git 
a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 0c8bcb94934e0..c45c1cfeb866d 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -44,6 +44,7 @@ #include #include #include +#include /* For older distros: */ #ifndef MAP_STACK @@ -1045,6 +1046,29 @@ static size_t syscall_arg__scnprintf_seccomp_flags(char *bf, size_t size, #define SCA_SECCOMP_FLAGS syscall_arg__scnprintf_seccomp_flags +static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + +#define P_FLAG(n) \ + if (flags & GRND_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~GRND_##n; \ + } + + P_FLAG(RANDOM); + P_FLAG(NONBLOCK); +#undef P_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); + + return printed; +} + +#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags + #define STRARRAY(arg, name, array) \ .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \ .arg_parm = { [arg] = &strarray__##array, } @@ -1137,6 +1161,8 @@ static struct syscall_fmt { { .name = "getdents64", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), }, + { .name = "getrandom", .errmsg = true, + .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, }, { .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), }, { .name = "getxattr", .errmsg = true, .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, -- GitLab From 46bc29b970f0011a9099077f1db8f3540aa829fe Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 8 Mar 2016 10:38:44 +0200 Subject: [PATCH 064/705] perf tools: Add time conversion event Intel PT uses the time members from the perf_event_mmap_page to convert between TSC and perf time. Due to a lack of foresight when Intel PT was implemented, those time members were recorded in the (implementation dependent) AUXTRACE_INFO event, the structure of which is generally inaccessible outside of the Intel PT decoder. However now the conversion between TSC and perf time is needed when processing a jitdump file when Intel PT has been used for tracing. So add a user event to record the time members. 'perf record' will synthesize the event if the information is available. And session processing will put a copy of the event on the session so that tools like 'perf inject' can easily access it. 
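The arithmetic that those three members enable is compact. Below is a self-contained sketch of the TSC-to-perf-time direction, mirroring the quotient/remainder split tools/perf/util/tsc.c uses so the 64-bit multiply cannot overflow; the mult/shift values in main() are made-up examples for a roughly 2.6 GHz TSC, not values taken from the patch:

/* Sketch of the conversion the new PERF_RECORD_TIME_CONV event makes
 * available outside the Intel PT decoder.  Parameter values below are
 * illustrative; real ones come from the perf_event_mmap_page. */
#include <stdio.h>
#include <stdint.h>

struct tsc_conv {
	uint16_t time_shift;
	uint32_t time_mult;
	uint64_t time_zero;
};

/* perf time (ns) = time_zero + (cyc * time_mult) >> time_shift,
 * computed as quotient and remainder to avoid 64-bit overflow. */
static uint64_t tsc_to_perf_time(uint64_t cyc, const struct tsc_conv *tc)
{
	uint64_t quot = cyc >> tc->time_shift;
	uint64_t rem  = cyc & (((uint64_t)1 << tc->time_shift) - 1);

	return tc->time_zero + quot * tc->time_mult +
	       ((rem * tc->time_mult) >> tc->time_shift);
}

int main(void)
{
	/* Hypothetical ~2.6 GHz TSC: mult = (10^9 << shift) / freq. */
	struct tsc_conv tc = { .time_shift = 31, .time_mult = 825955249,
			       .time_zero = 0 };

	/* 2600000000 cycles at 2.6 GHz should map to ~10^9 ns (1 s). */
	printf("%llu cycles -> %llu ns\n",
	       2600000000ULL,
	       (unsigned long long)tsc_to_perf_time(2600000000ULL, &tc));
	return 0;
}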
Signed-off-by: Adrian Hunter Cc: Jiri Olsa Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1457426324-30158-1-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/tsc.c | 31 +++++++++++++++++++++++++++++++ tools/perf/builtin-inject.c | 1 + tools/perf/builtin-record.c | 15 +++++++++++++++ tools/perf/util/event.c | 1 + tools/perf/util/event.h | 9 +++++++++ tools/perf/util/session.c | 6 ++++++ tools/perf/util/session.h | 1 + tools/perf/util/tool.h | 1 + tools/perf/util/tsc.h | 10 ++++++++++ 9 files changed, 75 insertions(+) diff --git a/tools/perf/arch/x86/util/tsc.c b/tools/perf/arch/x86/util/tsc.c index fd2868490d00e..70ff7c14bea6a 100644 --- a/tools/perf/arch/x86/util/tsc.c +++ b/tools/perf/arch/x86/util/tsc.c @@ -46,3 +46,34 @@ u64 rdtsc(void) return low | ((u64)high) << 32; } + +int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc, + struct perf_tool *tool, + perf_event__handler_t process, + struct machine *machine) +{ + union perf_event event = { + .time_conv = { + .header = { + .type = PERF_RECORD_TIME_CONV, + .size = sizeof(struct time_conv_event), + }, + }, + }; + struct perf_tsc_conversion tc; + int err; + + err = perf_read_tsc_conversion(pc, &tc); + if (err == -EOPNOTSUPP) + return 0; + if (err) + return err; + + pr_debug2("Synthesizing TSC conversion information\n"); + + event.time_conv.time_mult = tc.time_mult; + event.time_conv.time_shift = tc.time_shift; + event.time_conv.time_zero = tc.time_zero; + + return process(tool, &event, NULL, machine); +} diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index d1a2d104f2bc1..e5afa8fe1bf11 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -748,6 +748,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused) .auxtrace_info = perf_event__repipe_op2_synth, .auxtrace = perf_event__repipe_auxtrace, .auxtrace_error = perf_event__repipe_op2_synth, + .time_conv = perf_event__repipe_op2_synth, .finished_round = perf_event__repipe_oe_synth, .build_id = perf_event__repipe_op2_synth, .id_index = perf_event__repipe_op2_synth, diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 515510ecc76a4..410035c6e300b 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -29,6 +29,7 @@ #include "util/data.h" #include "util/perf_regs.h" #include "util/auxtrace.h" +#include "util/tsc.h" #include "util/parse-branch-options.h" #include "util/parse-regs-options.h" #include "util/llvm-utils.h" @@ -512,6 +513,15 @@ static void workload_exec_failed_signal(int signo __maybe_unused, static void snapshot_sig_handler(int sig); +int __weak +perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused, + struct perf_tool *tool __maybe_unused, + perf_event__handler_t process __maybe_unused, + struct machine *machine __maybe_unused) +{ + return 0; +} + static int record__synthesize(struct record *rec) { struct perf_session *session = rec->session; @@ -549,6 +559,11 @@ static int record__synthesize(struct record *rec) } } + err = perf_event__synth_time_conv(rec->evlist->mmap[0].base, tool, + process_synthesized_event, machine); + if (err) + goto out; + if (rec->opts.full_auxtrace) { err = perf_event__synthesize_auxtrace_info(rec->itr, tool, session, process_synthesized_event); diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index dad55d04ffdd5..b689590376880 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -45,6 +45,7 @@ 
static const char *perf_event__names[] = { [PERF_RECORD_STAT] = "STAT", [PERF_RECORD_STAT_ROUND] = "STAT_ROUND", [PERF_RECORD_EVENT_UPDATE] = "EVENT_UPDATE", + [PERF_RECORD_TIME_CONV] = "TIME_CONV", }; const char *perf_event__name(unsigned int id) diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 6bb1c928350d4..8d363d5e65a2e 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -233,6 +233,7 @@ enum perf_user_event_type { /* above any possible kernel type */ PERF_RECORD_STAT = 76, PERF_RECORD_STAT_ROUND = 77, PERF_RECORD_EVENT_UPDATE = 78, + PERF_RECORD_TIME_CONV = 79, PERF_RECORD_HEADER_MAX }; @@ -469,6 +470,13 @@ struct stat_round_event { u64 time; }; +struct time_conv_event { + struct perf_event_header header; + u64 time_shift; + u64 time_mult; + u64 time_zero; +}; + union perf_event { struct perf_event_header header; struct mmap_event mmap; @@ -497,6 +505,7 @@ union perf_event { struct stat_config_event stat_config; struct stat_event stat; struct stat_round_event stat_round; + struct time_conv_event time_conv; }; void perf_event__print_totals(void); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 4abd85c6346dd..ef370557fb9ae 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -409,6 +409,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool) tool->stat = process_stat_stub; if (tool->stat_round == NULL) tool->stat_round = process_stat_round_stub; + if (tool->time_conv == NULL) + tool->time_conv = process_event_op2_stub; } static void swap_sample_id_all(union perf_event *event, void *data) @@ -794,6 +796,7 @@ static perf_event__swap_op perf_event__swap_ops[] = { [PERF_RECORD_STAT] = perf_event__stat_swap, [PERF_RECORD_STAT_ROUND] = perf_event__stat_round_swap, [PERF_RECORD_EVENT_UPDATE] = perf_event__event_update_swap, + [PERF_RECORD_TIME_CONV] = perf_event__all64_swap, [PERF_RECORD_HEADER_MAX] = NULL, }; @@ -1341,6 +1344,9 @@ static s64 perf_session__process_user_event(struct perf_session *session, return tool->stat(tool, event, session); case PERF_RECORD_STAT_ROUND: return tool->stat_round(tool, event, session); + case PERF_RECORD_TIME_CONV: + session->time_conv = event->time_conv; + return tool->time_conv(tool, event, session); default: return -EINVAL; } diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 5f792e35d4c1e..f96fc9e8c52e7 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -26,6 +26,7 @@ struct perf_session { struct itrace_synth_opts *itrace_synth_opts; struct list_head auxtrace_index; struct trace_event tevent; + struct time_conv_event time_conv; bool repipe; bool one_mmap; void *one_mmap_addr; diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index 55de4cffcd4e9..ac2590a3de2d1 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -57,6 +57,7 @@ struct perf_tool { id_index, auxtrace_info, auxtrace_error, + time_conv, thread_map, cpu_map, stat_config, diff --git a/tools/perf/util/tsc.h b/tools/perf/util/tsc.h index a8b78f1b32438..280ddc067556a 100644 --- a/tools/perf/util/tsc.h +++ b/tools/perf/util/tsc.h @@ -3,10 +3,20 @@ #include +#include "event.h" #include "../arch/x86/util/tsc.h" u64 perf_time_to_tsc(u64 ns, struct perf_tsc_conversion *tc); u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc); u64 rdtsc(void); +struct perf_event_mmap_page; +struct perf_tool; +struct machine; + +int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc, + struct perf_tool *tool, + perf_event__handler_t process, + 
struct machine *machine); + #endif -- GitLab From 274529ba9bda86c91c2c06da3a641aaf617dd30f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Mar 2016 19:46:04 -0700 Subject: [PATCH 065/705] rcu: Consolidate dumping of ftrace buffer This commit consolidates a couple definitions and several calls for single-shot ftrace-buffer dumping. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 13 +++++++++++++ kernel/rcu/rcutorture.c | 17 +++-------------- kernel/rcu/tree.c | 4 ++-- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 2657aff2725b4..45de591657a6f 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1144,4 +1144,17 @@ static inline void rcu_sysidle_force_exit(void) #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ +/* + * Dump the ftrace buffer, but only one time per callsite per boot. + */ +#define rcu_ftrace_dump(oops_dump_mode) \ +do { \ + static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \ + \ + if (!atomic_read(&___rfd_beenhere) && \ + !atomic_xchg(&___rfd_beenhere, 1)) \ + ftrace_dump(oops_dump_mode); \ +} while (0) + + #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 250ea67c1615b..463867c432216 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1082,17 +1082,6 @@ rcu_torture_fakewriter(void *arg) return 0; } -static void rcutorture_trace_dump(void) -{ - static atomic_t beenhere = ATOMIC_INIT(0); - - if (atomic_read(&beenhere)) - return; - if (atomic_xchg(&beenhere, 1) != 0) - return; - ftrace_dump(DUMP_ALL); -} - /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -1142,7 +1131,7 @@ static void rcu_torture_timer(unsigned long unused) if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; @@ -1215,7 +1204,7 @@ rcu_torture_reader(void *arg) if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; @@ -1333,7 +1322,7 @@ rcu_torture_stats_print(void) rcu_torture_writer_state, gpnum, completed, flags); show_rcu_gp_kthreads(); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } rtcv_snap = rcu_torture_current_version; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9a535a86e7326..531a328076bdd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -637,7 +637,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user) idle_task(smp_processor_id()); trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); - ftrace_dump(DUMP_ORIG); + rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -799,7 +799,7 @@ static void rcu_eqs_exit_common(long long oldval, int user) trace_rcu_dyntick(TPS("Error on exit: not idle task"), oldval, rdtp->dynticks_nesting); - ftrace_dump(DUMP_ORIG); + rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! 
*/ -- GitLab From 70946a44deec299ef54c0ec933e8d82ddd4bcc6a Mon Sep 17 00:00:00 2001 From: Yao Dongdong Date: Mon, 7 Mar 2016 16:02:14 +0800 Subject: [PATCH 066/705] documentation: Make sample code and documentation consistent In the chapter 'analogy with reader-writer locking', the sample code uses spinlock_t in reader-writer case. Just correct it so that we can read the document easily. Signed-off-by: Yao Dongdong Signed-off-by: Paul E. McKenney --- Documentation/RCU/whatisRCU.txt | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index dc49c6712b17f..111770ffa10e7 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -681,22 +681,30 @@ Although RCU can be used in many different ways, a very common use of RCU is analogous to reader-writer locking. The following unified diff shows how closely related RCU and reader-writer locking can be. + @@ -5,5 +5,5 @@ struct el { + int data; + /* Other data fields */ + }; + -rwlock_t listmutex; + +spinlock_t listmutex; + struct el head; + @@ -13,15 +14,15 @@ struct list_head *lp; struct el *p; - - read_lock(); + - read_lock(&listmutex); - list_for_each_entry(p, head, lp) { + rcu_read_lock(); + list_for_each_entry_rcu(p, head, lp) { if (p->key == key) { *result = p->data; - - read_unlock(); + - read_unlock(&listmutex); + rcu_read_unlock(); return 1; } } - - read_unlock(); + - read_unlock(&listmutex); + rcu_read_unlock(); return 0; } @@ -732,7 +740,7 @@ Or, for those who prefer a side-by-side listing: 5 int data; 5 int data; 6 /* Other data fields */ 6 /* Other data fields */ 7 }; 7 }; - 8 spinlock_t listmutex; 8 spinlock_t listmutex; + 8 rwlock_t listmutex; 8 spinlock_t listmutex; 9 struct el head; 9 struct el head; 1 int search(long key, int *result) 1 int search(long key, int *result) @@ -740,15 +748,15 @@ Or, for those who prefer a side-by-side listing: 3 struct list_head *lp; 3 struct list_head *lp; 4 struct el *p; 4 struct el *p; 5 5 - 6 read_lock(); 6 rcu_read_lock(); + 6 read_lock(&listmutex); 6 rcu_read_lock(); 7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) { 8 if (p->key == key) { 8 if (p->key == key) { 9 *result = p->data; 9 *result = p->data; -10 read_unlock(); 10 rcu_read_unlock(); +10 read_unlock(&listmutex); 10 rcu_read_unlock(); 11 return 1; 11 return 1; 12 } 12 } 13 } 13 } -14 read_unlock(); 14 rcu_read_unlock(); +14 read_unlock(&listmutex); 14 rcu_read_unlock(); 15 return 0; 15 return 0; 16 } 16 } -- GitLab From 41abcf321d447b9987f6b7d1a9bb65831e786daf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Dec 2015 16:18:22 -0800 Subject: [PATCH 067/705] documentation: Add real-time requirements from CPU-bound workloads This commit records RCU's responsibility to avoid degrading latencies of CPUs running tight loops within properly configured workloads, both in kernel and in userspace. Signed-off-by: Paul E. McKenney --- .../RCU/Design/Requirements/Requirements.html | 10 +++++++++- .../RCU/Design/Requirements/Requirements.htmlx | 8 ++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index a725f9900ec89..3004baa71bcc3 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1,5 +1,5 @@ - + @@ -2170,6 +2170,14 @@ up to and including systems with 4096 CPUs. 
This real-time requirement motivated the grace-period kthread, which also simplified handling of a number of race conditions. +<p> +RCU must avoid degrading real-time response for CPU-bound threads, whether +executing in usermode (which is one use case for +<tt>CONFIG_NO_HZ_FULL=y</tt>) or in the kernel. +That said, CPU-bound loops in the kernel must execute +<tt>cond_resched_rcu_qs()</tt> at least once per few tens of milliseconds +in order to avoid receiving an IPI from RCU. + <p> Finally, RCU's status as a synchronization primitive means that any RCU failure can result in arbitrary memory corruption that can be diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx index 3a97ba490c42b..61caffc86823d 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -2337,6 +2337,14 @@ up to and including systems with 4096 CPUs. This real-time requirement motivated the grace-period kthread, which also simplified handling of a number of race conditions. +<p> +RCU must avoid degrading real-time response for CPU-bound threads, whether +executing in usermode (which is one use case for +<tt>CONFIG_NO_HZ_FULL=y</tt>) or in the kernel. +That said, CPU-bound loops in the kernel must execute +<tt>cond_resched_rcu_qs()</tt> at least once per few tens of milliseconds +in order to avoid receiving an IPI from RCU. + <p> Finally, RCU's status as a synchronization primitive means that any RCU failure can result in arbitrary memory corruption that can be -- GitLab From f43b62542eb61a52d97d6b82a786a912fa5e6c51 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 Jan 2016 09:12:43 -0800 Subject: [PATCH 068/705] documentation: Add synchronize_rcu_mult() to the requirements Signed-off-by: Paul E. McKenney --- .../RCU/Design/Requirements/Requirements.html | 92 +++++++++++++++++++ .../Design/Requirements/Requirements.htmlx | 82 +++++++++++++++++ 2 files changed, 174 insertions(+) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 3004baa71bcc3..59acd82e67d4a 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -2231,6 +2231,8 @@ described in a separate section.

  • Sched Flavor
  • Sleepable RCU
  • Tasks RCU +
  • + Waiting for Multiple Grace Periods

    Bottom-Half Flavor

    @@ -2480,6 +2482,81 @@ The tasks-RCU API is quite compact, consisting only of synchronize_rcu_tasks(), and rcu_barrier_tasks(). +

    +Waiting for Multiple Grace Periods

    + +

    +Perhaps you have an RCU protected data structure that is accessed from +RCU read-side critical sections, from softirq handlers, and from +hardware interrupt handlers. +That is three flavors of RCU, the normal flavor, the bottom-half flavor, +and the sched flavor. +How to wait for a compound grace period? + +

    +The best approach is usually to “just say no!” and +insert rcu_read_lock() and rcu_read_unlock() +around each RCU read-side critical section, regardless of what +environment it happens to be in. +But suppose that some of the RCU read-side critical sections are +on extremely hot code paths, and that use of CONFIG_PREEMPT=n +is not a viable option, so that rcu_read_lock() and +rcu_read_unlock() are not free. +What then? + +

    +You could wait on all three grace periods in succession, as follows: + +

    +
    + 1 synchronize_rcu();
    + 2 synchronize_rcu_bh();
    + 3 synchronize_sched();
    +
    +
    + +

    +This works, but triples the update-side latency penalty. +In cases where this is not acceptable, synchronize_rcu_mult() +may be used to wait on all three flavors of grace period concurrently: + +

    +
    + 1 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched);
    +
    +
    + +

    +But what if it is necessary to also wait on SRCU? +This can be done as follows: + +

    +
    + 1 static void call_my_srcu(struct rcu_head *head,
    + 2        void (*func)(struct rcu_head *head))
    + 3 {
    + 4   call_srcu(&my_srcu, head, func);
    + 5 }
    + 6
    + 7 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched, call_my_srcu);
    +
    +
    + +

    +If you needed to wait on multiple different flavors of SRCU +(but why???), you would need to create a wrapper function resembling +call_my_srcu() for each SRCU flavor. + +

    Quick Quiz 15: +But what if I need to wait for multiple RCU flavors, but I also need +the grace periods to be expedited? +
    Answer + +

    +Again, it is usually better to adjust the RCU read-side critical sections +to use a single flavor of RCU, but when this is not feasible, you can use +synchronize_rcu_mult(). +

    Possible Future Changes

    @@ -2901,5 +2978,20 @@ during scheduler initialization.

    Back to Quick Quiz 14. + +

    Quick Quiz 15: +But what if I need to wait for multiple RCU flavors, but I also need +the grace periods to be expedited? + + +

    Answer: +If you are using expedited grace periods, there should be less penalty +for waiting on them in succession. +But if that is nevertheless a problem, you can use workqueues or multiple +kthreads to wait on the various expedited grace periods concurrently. + + +

    Back to Quick Quiz 15. + diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx index 61caffc86823d..6ff4966672e2e 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -2398,6 +2398,8 @@ described in a separate section.

  • Sched Flavor
  • Sleepable RCU
  • Tasks RCU +
  • + Waiting for Multiple Grace Periods

    Bottom-Half Flavor

    @@ -2647,6 +2649,86 @@ The tasks-RCU API is quite compact, consisting only of synchronize_rcu_tasks(), and rcu_barrier_tasks(). +

    +Waiting for Multiple Grace Periods

    + +

    +Perhaps you have an RCU protected data structure that is accessed from +RCU read-side critical sections, from softirq handlers, and from +hardware interrupt handlers. +That is three flavors of RCU, the normal flavor, the bottom-half flavor, +and the sched flavor. +How to wait for a compound grace period? + +

    +The best approach is usually to “just say no!” and +insert rcu_read_lock() and rcu_read_unlock() +around each RCU read-side critical section, regardless of what +environment it happens to be in. +But suppose that some of the RCU read-side critical sections are +on extremely hot code paths, and that use of CONFIG_PREEMPT=n +is not a viable option, so that rcu_read_lock() and +rcu_read_unlock() are not free. +What then? + +

    +You could wait on all three grace periods in succession, as follows: + +

    +
    + 1 synchronize_rcu();
    + 2 synchronize_rcu_bh();
    + 3 synchronize_sched();
    +
    +
    + +

    +This works, but triples the update-side latency penalty. +In cases where this is not acceptable, synchronize_rcu_mult() +may be used to wait on all three flavors of grace period concurrently: + +

    +
    + 1 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched);
    +
    +
    + +

    +But what if it is necessary to also wait on SRCU? +This can be done as follows: + +

    +
    + 1 static void call_my_srcu(struct rcu_head *head,
    + 2        void (*func)(struct rcu_head *head))
    + 3 {
    + 4   call_srcu(&my_srcu, head, func);
    + 5 }
    + 6
    + 7 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched, call_my_srcu);
    +
    +
    + +

    +If you needed to wait on multiple different flavors of SRCU +(but why???), you would need to create a wrapper function resembling +call_my_srcu() for each SRCU flavor. + +

    @@QQ@@ +But what if I need to wait for multiple RCU flavors, but I also need +the grace periods to be expedited? +

    @@QQA@@ +If you are using expedited grace periods, there should be less penalty +for waiting on them in succession. +But if that is nevertheless a problem, you can use workqueues or multiple +kthreads to wait on the various expedited grace periods concurrently. +

    @@QQE@@ + +

    +Again, it is usually better to adjust the RCU read-side critical sections +to use a single flavor of RCU, but when this is not feasible, you can use +synchronize_rcu_mult(). +

    Possible Future Changes

    -- GitLab From d8936c0b7e29510ce8f5c85ff5fcc592a938e860 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Feb 2016 17:29:47 -0800 Subject: [PATCH 069/705] documentation: Explain why rcu_read_lock() needs no barrier() This commit adds a Quick Quiz whose answer explains why the compiler code reordering enabled by CONFIG_PREEMPT=n's empty rcu_read_lock() and rcu_read_unlock() functions does not hinder RCU's ability to figure out which RCU read-side critical sections have completed and not. Signed-off-by: Paul E. McKenney --- .../RCU/Design/Requirements/Requirements.html | 130 ++++++++++++------ .../Design/Requirements/Requirements.htmlx | 28 ++++ 2 files changed, 115 insertions(+), 43 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 59acd82e67d4a..2a56031bfdd4f 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -583,6 +583,17 @@ The first and second guarantees require unbelievably strict ordering! Are all these memory barriers really required?
    Answer +

    Quick Quiz 7: +You claim that rcu_read_lock() and rcu_read_unlock() +generate absolutely no code in some kernel builds. +This means that the compiler might arbitrarily rearrange consecutive +RCU read-side critical sections. +Given such rearrangement, if a given RCU read-side critical section +is done, how can you be sure that all prior RCU read-side critical +sections are done? +Won't the compiler rearrangements make that impossible to determine? +
    Answer +

    Note that these memory-barrier requirements do not replace the fundamental RCU requirement that a grace period wait for all pre-existing readers. @@ -626,9 +637,9 @@ inconvenience can be avoided through use of the call_rcu() and kfree_rcu() API members described later in this document. -

    Quick Quiz 7: +

    Quick Quiz 8: But how does the upgrade-to-write operation exclude other readers? -
    Answer +
    Answer

    This guarantee allows lookup code to be shared between read-side @@ -714,9 +725,9 @@ to do significant reordering. This is by design: Any significant ordering constraints would slow down these fast-path APIs. -

    Quick Quiz 8: +

    Quick Quiz 9: Can't the compiler also reorder this code? -
    Answer +
    Answer

    Readers Do Not Exclude Updaters

    @@ -769,10 +780,10 @@ new readers can start immediately after synchronize_rcu() starts, and synchronize_rcu() is under no obligation to wait for these new readers. -

    Quick Quiz 9: +

    Quick Quiz 10: Suppose that synchronize_rcu() did wait until all readers had completed. Would the updater be able to rely on this? -
    Answer +
    Answer

    Grace Periods Don't Partition Read-Side Critical Sections

    @@ -969,11 +980,11 @@ grace period. As a result, an RCU read-side critical section cannot partition a pair of RCU grace periods. -

    Quick Quiz 10: +

    Quick Quiz 11: How long a sequence of grace periods, each separated by an RCU read-side critical section, would be required to partition the RCU read-side critical sections at the beginning and end of the chain? -
    Answer +
    Answer

    Disabling Preemption Does Not Block Grace Periods

    @@ -1127,9 +1138,9 @@ synchronization primitives be legal within RCU read-side critical sections, including spinlocks, sequence locks, atomic operations, reference counters, and memory barriers. -

    Quick Quiz 11: +

    Quick Quiz 12: What about sleeping locks? -
    Answer +
    Answer

    It often comes as a surprise that many algorithms do not require a @@ -1354,12 +1365,12 @@ write an RCU callback function that takes too long. Long-running operations should be relegated to separate threads or (in the Linux kernel) workqueues. -

    Quick Quiz 12: +

    Quick Quiz 13: Why does line 19 use rcu_access_pointer()? After all, call_rcu() on line 25 stores into the structure, which would interact badly with concurrent insertions. Doesn't this mean that rcu_dereference() is required? -
    Answer +
    Answer

    However, all that remove_gp_cb() is doing is @@ -1406,14 +1417,14 @@ This was due to the fact that RCU was not heavily used within DYNIX/ptx, so the very few places that needed something like synchronize_rcu() simply open-coded it. -

    Quick Quiz 13: +

    Quick Quiz 14: Earlier it was claimed that call_rcu() and kfree_rcu() allowed updaters to avoid being blocked by readers. But how can that be correct, given that the invocation of the callback and the freeing of the memory (respectively) must still wait for a grace period to elapse? -
    Answer +
    Answer

    But what if the updater must wait for the completion of code to be @@ -1838,11 +1849,11 @@ kthreads to be spawned. Therefore, invoking synchronize_rcu() during scheduler initialization can result in deadlock. -

    Quick Quiz 14: +

    Quick Quiz 15: So what happens with synchronize_rcu() during scheduler initialization for CONFIG_PREEMPT=n kernels? -
    Answer +
    Answer

    I learned of these boot-time requirements as a result of a series of @@ -2547,10 +2558,10 @@ If you needed to wait on multiple different flavors of SRCU (but why???), you would need to create a wrapper function resembling call_my_srcu() for each SRCU flavor. -

    Quick Quiz 15: +

    Quick Quiz 16: But what if I need to wait for multiple RCU flavors, but I also need the grace periods to be expedited? -
    Answer +
    Answer

    Again, it is usually better to adjust the RCU read-side critical sections @@ -2827,18 +2838,51 @@ adhered to the as-if rule than it is to actually adhere to it!

    Quick Quiz 7: -But how does the upgrade-to-write operation exclude other readers? +You claim that rcu_read_lock() and rcu_read_unlock() +generate absolutely no code in some kernel builds. +This means that the compiler might arbitrarily rearrange consecutive +RCU read-side critical sections. +Given such rearrangement, if a given RCU read-side critical section +is done, how can you be sure that all prior RCU read-side critical +sections are done? +Won't the compiler rearrangements make that impossible to determine?

    Answer: -It doesn't, just like normal RCU updates, which also do not exclude -RCU readers. +In cases where rcu_read_lock() and rcu_read_unlock() +generate absolutely no code, RCU infers quiescent states only at +special locations, for example, within the scheduler. +Because calls to schedule() had better prevent calling-code +accesses to shared variables from being rearranged across the call to +schedule(), if RCU detects the end of a given RCU read-side +critical section, it will necessarily detect the end of all prior +RCU read-side critical sections, no matter how aggressively the +compiler scrambles the code. + +

    +Again, this all assumes that the compiler cannot scramble code across +calls to the scheduler, out of interrupt handlers, into the idle loop, +into user-mode code, and so on. +But if your kernel build allows that sort of scrambling, you have broken +far more than just RCU!

    Back to Quick Quiz 7.

    Quick Quiz 8: +But how does the upgrade-to-write operation exclude other readers? + + +

    Answer: +It doesn't, just like normal RCU updates, which also do not exclude +RCU readers. + + +

    Back to Quick Quiz 8. + + +

    Quick Quiz 9: Can't the compiler also reorder this code? @@ -2848,10 +2892,10 @@ No, the volatile casts in READ_ONCE() and this particular case. -

    Back to Quick Quiz 8. +

    Back to Quick Quiz 9. - -

    Quick Quiz 9: + +

    Quick Quiz 10: Suppose that synchronize_rcu() did wait until all readers had completed. Would the updater be able to rely on this? @@ -2866,10 +2910,10 @@ Therefore, the code following in any case. -

    Back to Quick Quiz 9. +

    Back to Quick Quiz 10. - -

    Quick Quiz 10: + +

    Quick Quiz 11: How long a sequence of grace periods, each separated by an RCU read-side critical section, would be required to partition the RCU read-side critical sections at the beginning and end of the chain? @@ -2883,10 +2927,10 @@ Therefore, even in practice, RCU users must abide by the theoretical rather than the practical answer. -

    Back to Quick Quiz 10. +

    Back to Quick Quiz 11. - -

    Quick Quiz 11: + +

    Quick Quiz 12: What about sleeping locks? @@ -2914,10 +2958,10 @@ the mutex was not immediately available. Either way, mutex_trylock() returns immediately without sleeping. -

    Back to Quick Quiz 11. +

    Back to Quick Quiz 12. - -

    Quick Quiz 12: + +

    Quick Quiz 13: Why does line 19 use rcu_access_pointer()? After all, call_rcu() on line 25 stores into the structure, which would interact badly with concurrent insertions. @@ -2933,10 +2977,10 @@ is released on line 25, which in turn means that rcu_access_pointer() suffices. -

    Back to Quick Quiz 12. +

    Back to Quick Quiz 13. - -

    Quick Quiz 13: + +

    Quick Quiz 14: Earlier it was claimed that call_rcu() and kfree_rcu() allowed updaters to avoid being blocked by readers. @@ -2957,10 +3001,10 @@ next update as soon as it has invoked call_rcu() or grace period. -

    Back to Quick Quiz 13. +

    Back to Quick Quiz 14. - -

    Quick Quiz 14: + +

    Quick Quiz 15: So what happens with synchronize_rcu() during scheduler initialization for CONFIG_PREEMPT=n kernels? @@ -2976,10 +3020,10 @@ so it is still necessary to avoid invoking synchronize_rcu() during scheduler initialization. -

    Back to Quick Quiz 14. +

    Back to Quick Quiz 15. - -

    Quick Quiz 15: + +

    Quick Quiz 16: But what if I need to wait for multiple RCU flavors, but I also need the grace periods to be expedited? @@ -2991,7 +3035,7 @@ But if that is nevertheless a problem, you can use workqueues or multiple kthreads to wait on the various expedited grace periods concurrently. -

    Back to Quick Quiz 15. +

    Back to Quick Quiz 16. diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx index 6ff4966672e2e..98da30ca84c46 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -682,6 +682,34 @@ That said, it is much easier to fool yourself into believing that you have adhered to the as-if rule than it is to actually adhere to it!

    @@QQE@@ +

    @@QQ@@ +You claim that rcu_read_lock() and rcu_read_unlock() +generate absolutely no code in some kernel builds. +This means that the compiler might arbitrarily rearrange consecutive +RCU read-side critical sections. +Given such rearrangement, if a given RCU read-side critical section +is done, how can you be sure that all prior RCU read-side critical +sections are done? +Won't the compiler rearrangements make that impossible to determine? +

    @@QQA@@ +In cases where rcu_read_lock() and rcu_read_unlock() +generate absolutely no code, RCU infers quiescent states only at +special locations, for example, within the scheduler. +Because calls to schedule() had better prevent calling-code +accesses to shared variables from being rearranged across the call to +schedule(), if RCU detects the end of a given RCU read-side +critical section, it will necessarily detect the end of all prior +RCU read-side critical sections, no matter how aggressively the +compiler scrambles the code. + +

    +Again, this all assumes that the compiler cannot scramble code across +calls to the scheduler, out of interrupt handlers, into the idle loop, +into user-mode code, and so on. +But if your kernel build allows that sort of scrambling, you have broken +far more than just RCU! +

    @@QQE@@ +

    Note that these memory-barrier requirements do not replace the fundamental RCU requirement that a grace period wait for all pre-existing readers. -- GitLab From 514f1eb5f44520d5255b927ad5aabc00db5bc73d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Feb 2016 16:52:35 -0800 Subject: [PATCH 070/705] documentation: Document illegality of call_rcu() from offline CPUs There is already a blanket statement about no member of RCU's API being legal from an offline CPU, but add an explicit note where it states that it is illegal to invoke call_rcu() from an NMI handler. Signed-off-by: Paul E. McKenney --- Documentation/RCU/Design/Requirements/Requirements.html | 3 ++- Documentation/RCU/Design/Requirements/Requirements.htmlx | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 2a56031bfdd4f..01e12b86e81fd 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1354,7 +1354,8 @@ situations where neither synchronize_rcu() nor synchronize_rcu_expedited() would be legal, including within preempt-disable code, local_bh_disable() code, interrupt-disable code, and interrupt handlers. -However, even call_rcu() is illegal within NMI handlers. +However, even call_rcu() is illegal within NMI handlers +and from offline CPUs. The callback function (remove_gp_cb() in this case) will be executed within softirq (software interrupt) environment within the Linux kernel, diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx index 98da30ca84c46..3355f1f9384c9 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -1513,7 +1513,8 @@ situations where neither synchronize_rcu() nor synchronize_rcu_expedited() would be legal, including within preempt-disable code, local_bh_disable() code, interrupt-disable code, and interrupt handlers. -However, even call_rcu() is illegal within NMI handlers. +However, even call_rcu() is illegal within NMI handlers +and from offline CPUs. The callback function (remove_gp_cb() in this case) will be executed within softirq (software interrupt) environment within the Linux kernel, -- GitLab From 11a65df5732167519937eabf16a870f5f8bde5ee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Mar 2016 11:03:36 -0700 Subject: [PATCH 071/705] documentation: Remove unnecessary images from requirements This commit removes a cutesy cartoon and also a diagram that can just as easily be represented by text. Reported-by: Linus Torvalds Signed-off-by: Paul E. 
McKenney --- .../Requirements/2013-08-is-it-dead.png | Bin 100825 -> 0 bytes .../Design/Requirements/RCUApplicability.svg | 237 ------------------ .../RCU/Design/Requirements/Requirements.html | 28 ++- .../Design/Requirements/Requirements.htmlx | 28 ++- 4 files changed, 40 insertions(+), 253 deletions(-) delete mode 100644 Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png delete mode 100644 Documentation/RCU/Design/Requirements/RCUApplicability.svg diff --git a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png deleted file mode 100644 index 7496a55e4e7b41becdb658a2bd34765fbe80013f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 [100825-byte base85 payload of the deleted 2013-08-is-it-dead.png image omitted]

    BOI+6w9lx6S1n|?%q|T1_QrBfA?tl=g3IP~p)R(* zo>O%Ff!=Vq{pk@R#ihy9PlZJIk#ry zd-C@?=?wNr_s4MT929r%-02O?@gdBl_`!hHbmysZACFj%z{z=XNqRvZ%;tB3gZu)y z<9Jskbc({)%V!#O5WLK~M+h0K_xG&+t{X&k3_~-nm!WqMPd9ZAS$FK$EcFpDiz%V}NDZru(N#e1}S-=kjC##-3kUV-rU9YF^ogDhwG)bW0! ztKPHozgr|X3Mvc5ecw!-0(-)d@U_}duL}~UVF^fpub=@MPV4?+Oua*UQtmj zVB#TQoFCNyM81dxi6`*;dx1^ULqz_a>BgEG z4M0p0dB=`gPdo7e`w zZ&R;pYV2x$B}eS_8Jsgr(c;nN77Z01q3PYr&Gb%s@tvQD5>!xfu-(Gy-HWnFLIDsC z8W2twxM5O`hAVQVNOFzReVtT~tPgOQ`Q;qKcDbP00vKXP+`exv3%-9t ziSv=-D(~o&l-@P>w~>yl`)S-S{ok%EWQwqoE4tNopbxc!>?oqQ=}Xr4EdL}q!R|Zt zc3ZXa=EGCF?sfr4JL3Y5&0B9t`gR$7Esu~iadBZ`L2X1dcSl}f;VW27BY)RFMmfiB zT>=x&Kec%4=FKGL97vmIK{_9?th{&sej_q`{C$?Qs_YKI)vVm-2E8Y<7@sH>RKv&+ z2^SMwDi&W3rSMk>$!8nsptf2o`|S-Ioyv`8?UEyxs5dOlc~(?X((zl8T3C2#rQ|QI zXt4|E=GLy+tX&1ut21;s7(<>nH}jxXropmf_|OW>mk_d!+@w#r?w_S-H`{G3C=*AY zoI4D`$P8s?bB)LGrS2QT5+U%svWmXwuTNT^8l$t={~Do6*4_c@@K)yh6rtnv;h^h>{FsPg;w zrq%y+>wRMq!~)diA6Ts{7F~+@%z^~+9SLxVQIM!`M7*>%nws;N9lI~pU8y4vn=EOS z6{}_^j>9>^BP^U?HU7$r>NDL_kB$&>GC-y5Pt1e@nj>M3H62lT-rBkj z3JJZkQUwylVVxY$g0oIe4NUuJliZ_6PSJv*i2mh?f5FTQ>I2B4fq3`S(-p2cx_pi> zB4(z*fsIa1MuSngbLrt~p1jfKLOXpNMh*f^!%ciP(ruw7na*)}8}7 zLNx`6ILXt+bkC8t2j~q+aRKuNwJ%zeB}|A>2vgM%I$*wq^{9T*4N?O@48hm6W@2=7 z$3lJATebxMX+C;*OLnqGglz-g{{poyo6(C?bth4Sj|~F6`VP{o28%-x6nw`@;V~Wj z?EmORE7=8~@AO_yZAqNz+PWhDYQa8}y zy}k~nM`l7yKU=`w!C?$la8#9xqRcFSX@LIr(NSGNRj2zptwR8KS@Q_f{kq(M?^X5n zt2#S7N5{waW9?1N&8ssg%6+4j8Hk;>B8M^>5kNoKj5XE!hnKpp=+)UkrT}cJH=n zaBy%ie8rdzh61YfuQgJAAOgrUwTjic>w&mfPyO4sY)mX{Re&8XV8oI~nF*3zK&O!B zKj(>}{{W&E_|@3hU~qNTOH!wbwiAf}aZ23)KeQu}Jj4;=0I{9HnV;`#>lQ|?y{xTd zJ9kr9J)HNs&)#ufS?e20t6-|45E#_C(<_VGNSXrhrCd0<3?e{DRH_Wr9EZ3#SsSX0 zi_4v#3Cg}jyAI<*3R;^!e*9r*(@~sP%8nTq7MyD_9{N&sO>!1Pw2W8oKxTJ7ORfV* zv!9sm3HTPoH+f@Lh<@A5xZRcr2sr0dTT|f2K#z-CPjV3;?aTgj`Wob0Tx}VbgPma> z;YoHVB$oxm-veER1{M|6z%>l7m=AnRY0JlHUq%#ziW_Ws90t} zC4ozeYo2Ps)qj#(?W);924ae+MNc)aZFCuXyN!5=!}&$$DfnQLH;3(MAE5~bd-@4b z45=;&x8Lm>8p?d<26GC7NZ48lpUravy}hEuq*8fEn!;J9D>Rx~4F_>Vsn`?BCbIVi zs+Hj}LRE|a39k|r1GDPNs=%)5IxW;VQuSLe?^yNzJ;%9oOAA`iMijG8ms|u^+pim~ zJpm-bk7NE4Qh7nEvEP{5+XNM1VgJd-onIP&aCdZ*J_X&gQ!>OXY*z7UYNV&#w!ZfK z_N{~}07_xN!rz96YGM)+hhJY8+y<4*cenrsgjo2)>kz}3w!mr4N500;N+@aUgyOLU zpmn_LOMCke!|5Lbq_>O~B)93Mx0jbvquLlaxaQI31kvUWcNZ5*oiJQUgS(3&QrA*= zwv>(B2ubVbqO2_QpDRkcFZCm6pVj)Zkx`X>7z+pq{dUH6wg)1?VABrFn2<#-Ew?lV zXyfwb%StvgTFaeQ?sR7)YWCJ|@%{7bM8YBlW&W{-JZ*?{5bbqMZ{Zw+iI3d9v#Q*s`bT_PJCdf&Prnev=n&R`L@64m-TnA&6xB7@FHcL}mTd(S* z6gg`GPtItGqr%VQj=4{+*-M<@77c(H>Du>R@Z?5LD1_23-)JJqsnw!cEB^cNvyX@! 
zyK^0N?kQ(lN*4K9dYaH)Z{D;?4BF#axp7l~zhb{x8BBw9lVk+Br@ILHcEkKlt>#i- z6t%9d76rRa%MUk{VR9CrXBicSGI{p8+UKzHi1!2PFdq#{T}|O>HNlvWp8JoI-l5T+ z>X5Lo<1~vc^|wWp=?vh;5jPrUA1oCu^<~QeTPCYO*z!v$rH)L6G+@Z{QnqaoCVOj z7%tS$k$<^_(-dvPIMgN#DkwFz93Y#hWDQO^uOOzhx~3oiiIC8nYeJJWwY0RdC8MD{ z*Hmg&B)IyFEBmCE_=eSF88wG8nerzh)(P zk?l;zyVXChD$|#kZg8lgpQNI0Zufk!O+-YMG}`Co<{rJ&8bR25`TW}#T{gl;N-F2{ znH^g;Z6a9#q2UXXDrKzQAFI1J;Oinm_W>qfZ6|DEB`kBjW=?5{BA<&0>re$&J-%rGiQKf`%|m)ud^NKbA$tO-9e zc#`)Au_lRWF%ovMqv;teiSjZP%crD-ib8v*C|q1O`eHNTC*II7gu}y8)>tVMKRY!1L+W{tmM{XyE~@a?>1i0%EWM9z2p(ZX;TV8Zs$q)kkpt5&b& ze@<{D>^aY&pMCmG6C;~Zg@B>Xf37p1x*j`dHkk1^Mzdg-aon2x!d#Gm6tU~b&i3~9 z%p+g6ZTPoTj@c;;4_nfYZ_$4mUue^$UOts2eaN7{m_aiKWKq21o(_l-KEqad(KK2E z2Wn{uYo$Je!Ous{2u_sC0K%ESYGrKhhm{P@{lJx^iZb<@x+B+o;1n4OK9?k4cL8c*-RpndhplPAWy(FPVB z`IY zUY5QLS60{bbsIR!{jRS07sF3B$i~971W45Hz4b!)KF9l!jqK<5RVw?65#$&O`hL~x z=Wg$Id-V5{FsGB@!~Jj86BsYK_ni+(B?yU_p^AyHVjUY>N4V7Xr|e|qW7@y!$V#es zKR3cZ^uah$CS+XZ!;l6e@a_wN(MkI^!>jA*8Sz^>0B!K-*w_V3Oom!&F6Wxq5zeeK zU;Sb$#-2O64foTSnWbbUR6<}` zF2AZsg8dcKJ5z{oeeT}llss41AW)^Z`Tw0H z=cqAmOUSVBn&7=Bvrk`*lXqg|dwe#N2w(1eh38+$CuJEj@MehBZKAk{pD8WSj#n}S zi6a6-nV}%H?jyRRtKccVA$uL=7FDBFEFV>|aPR!jOK6eENuDAq<9@|i6^8444w5pOLr}4* zt}ZlYLy`bE!b`aHU^c#_kQE%ld+l??w48_y#G++Th?q)VZ&(OxvMzDX5SG|?2`VOh z@Lr;l&6=Ug_rn1Muhv##EiuEU%8#LI93?%01;W$-Ax>IBBOegK%}8GmIoOP8lX{=B3^A8@@1GJwI5EzspyZA-;ZUdL zT4#&Zbz=15&g|UWwi6lCJiK^jz<2trbMAFm3I3g~}XEXW=R&a(;^$)Fy{7GET6Q@P%lfaDi($X4At5d~n@N*3J zrjz={pRPcCZnZo;ZcCoe$7LzX3;}d5-F^D2s{TOsJ_<9NJaAh4ZrC`||IcB|YLGJhB9- zNqvJL#bGOBCO)0R7%_SC{G5>f^s-Oii4jTadykQF0P71$VnSuX|6)@77`NJ3nn4~F z?lS{6syG)IRuSqhLOh@UU%aaY#f2#7r+V;G0gA7XT=8nhx-;@=4J{BaPNL={*VS7j%1=rIwEBSD9)kWharz_T9b zfTZ@ML6vV$)g3zCVHd&zbc5aJ@szz%***0g9ZEG*;*N6sd;JuUg06rymiFt7 z0&pgu39;1B>3>Dd<#8af^gh4pqpwub3avt7wQpFtU3lIrgQVBi+7!3L;ahFZwo9u= zd0pc#*EPQ;&yT-cHhG0vjhV9V;mV7XrMYv9iw2&O+YS|8s7UDhqH*v3?wu3whlk62 z-`^w0$!jfSDsD)>R+u#l&wCW@m-6soB7VJ>lhI09jgbVi@e@anid6ie?%I=e=v`|V z|3-g*fAE7(-`G1H*Q_8!cI>zzWg4E>H?b$?KozgAaY=Eps(8%Bkin_Z07u8{nXEU} zZ`QT?HQ)1Fd6C$-QD`wFO6oPQd7JaQb>_pf_FAY}>X9 zBGieyyHZk9!vgmxDq?%p=L368-(jB@VYFZ3&Jrv11U16X#VY6Z2%3}c9!$XSZH2~dH}!W z=~DNVe0+THVz50twiQ5`1*U-8ad9L81?K&RR*Km6?aK&WcnsucSc4cLi@w0A!gn(; zF)^XAc0=;Gl$>mS?cYNTFv7>p%1lmNojG&n+MgILQF3hdqf=9l z^N+}Z19^@ipBrFVU||u9(JHWRWJJ;3j|w|sM%Bu(uXQOFoe8*YDf?vtY zlZLgU=lgdiJG&vpSF+d&&~WPTndD*6lavn7+pEL3jATa+S4*Xq;sD230)I)t^R*crQh|952cs$>ws~{#$t;6?P%_t~ zo(LEi9Q=qLtgpme0+@n(F534{)y?q<}l&|xZT?3=E6FkfWHH&;=NAo zMb|nyI%I$Dl$L$~O6(X`vD~~BBq^gRh<_1k68=`aoiv$KskkN>OuM^PA&+veUcDbj z+{oPG+&Oh*q4fNU@o`t95?9eLZEd!1o$Txmk=9~3YHi-;NCsjE%);LObg#m_J?C}I z14>ToASy)>s@J>)1q4u-DiX)X$9-!0QAR}U?Ccc1$1P$!;DoVs_dlM5A4AjD`nsV%*y1M!@`dSGi zT5xbM|HR|-pX;7kw41Yg5202$_~K7alIV|Qk>rJ31& zq}t@vUhwZHa(op`9`TunOTT`jFA6b1gpf6@BCZWUf_sDmWK`xQZE0blchF3{OT_6* zeh?~~HAF58E2~tO?^f2p;g-22(vs|KGA z&#gOhc54|Dv2AMzS6A00zyK5N5DOVTa6McUJSl;K!L=jX>tDRMI6pt11`+&Q0qiA5 z(QcQUrB;*#j1;zJ9Ioi}8Iqt<@9sG->Fwk55vlbcyldt{SoOgV4=$-wKf)b$%>$|K zDArMKbv788DA=JDNcUfKLV}cJ6CO8njdyl-wh^w;+V=K1;s#96UiR<~9zc6zW{4Sk zZZ59G$bJ}G!b3wt2YEII1_m;!!Z+|(VFdiuiJNW^YsM!hH^KB_i^t6zj#nx%9xB1$ z+#3rS+pnc^TRVS+eSJ{q5vc} z$EyIfeH$M%C(Vb9su0anP_6MWfX6EcIj7Eh`T6-TXDb(`Pkn4|PJzt@g_^e9b2YK#$vjj7NJRvypfWUq8?5Tc7#$JWQy=PAo2DG%aQjYma$;jM<;<$CQrfrx1pZmq^c(7115~FnF z@h;iGiFfie8i*lb3V})JWFQEe;d)pP)8XA+M#p0Z^m0B21qUCqu-N3#Q5?y}##YM%uM?5VU`^QN(vB^CwM2{~qZR&#ycwPZ0B|OAHKP%nc!y^SsQ4lUtsQ&O$_0_Y6 zhJ60GQM3F#Za?1|)zAD6TslPbV^kqAV(;G&4gm#K^u0!X{s7vAj$iK*A5hS)hFF7D zTt+v>1h`R!Ux|VfX5VkKDWCp6GwO-z)$1yVi>1~pRV@p+i*-5J}#WUc#+Z7 z)3ZfWeei&PwIedkh!Q*`4>7t+j)ct4%zQu>SMXlm=Hn)5bxcL@zf(F@V7(8mo6)Uj 
zzJDb7czJK=iKn1sQmKDiD;L!WYe;A(em1;&mx3}C`tI{l-Dd_RKeO^t@glRcC2>Sc zG243I{QJ<*A;KC%ll(R81a8peNa5y3Am<9-`(jGeWv)POA?sI780w{+-!32#W6Mrh z0&OFn?HUwj7IIuYkBLbd|);X9OpmjE;?RGYsrB@RkZ zh`Rdv2bq~+D1RH2l*-x|JVr)FUR6S>V<491Cphtaqc8(iH8xUkE{$%De9qfQItXzP z*!X2v0M-z6nAoog4H75*`~(5bLXeX{t|bC!l6NDaXGm1S{|3D98sj4$uxdW9K79DV zMbD@X8WiJZjgE~q1Azy9!?sX)lcIux^~*HWE`@-3g-xqhR?LqdB0iLIK6>;Bo?=y; zSW=DyA&F5IR>B#I(c=ls0B{n{FdFpt_jkkOqweip(lYK~Z;#Tw6UOBA$ko-*q|c;& zPV;@GC!CD@23VJrmBj{^)^j8`Q)}xL8#ivmC`UD(W&(6@z}eZk{Q7gW8+p(vfu}^I zq&R(id_FWa0iJCnpVxx`{0=*RVeIjesEE7QQ;m=S@F2Z6Z-}GjsITwjO26K!?=>j2 z;*nwv#@UCBPPVyqK z2f{(T>W+3W+(N|C`YqZIj+z3o5p>wO5?_0yr`N1}2G6Up?}(g$f-`|Zi);_yHLF(> zs@fI!TvghM)jJA7jhI0OAZ19HRF3wIC(y~so z5p4W@5!bK3#>u}9FQw{(qe?Bj%J=+AN=r$;gqu%48 znc*i5qnPTk^Z$*d4+#z3ucO1UcX7rX?);X|Q5L;FG9}Nv%k&j^TVKyaj9s~c@Ipj3 zXWp7I^ff?c!oriI<)CXyUf%{*S#gn|X536m$AR7;ElmM!?bm1??lS(0*Sz5-N*n1A zK`GlujiiTkcX!hf87nT*Nn<*0f*cOnfkrQ{rS9w3axFu-Yu+bz=N(Jd0Jla` zaWOA2a)!sLW;}5ra7tf{`=}Uqvu98M{yxt$OR`t~;UPy4{VCeAp#8e$Y_{vy{C{M9 zcRbha9{0~IiHIU2qe93kD>HlVkTOCNDkBZMDIv-VDKjfOWm7guGAqeUMx-)Rp7*!= zJpVk;>vhh3?sM+bkKgxuU7zdod9O?Fco_ChWPCi0w6yd>Q>93cxZr^Uc^)-*M{qPJ zV-tD{T4b9^$*h|2n-gFRR{ZGL*uBWepxIeB=AAo7(c;K>PcuW&bpr=aM&F+Uf`WvE zot(0C?LlcBE%!?t#vk2rwWVv<(evj;Bub%C6?b@)}#|i{1Me zo>iFT7%0gQ9!3WVbfZMiM}GhuL;!$-{q3i602G?%_iY zXn5?fx7nJ(V$-2(-%L+`OOH<#TriHIYPNP$ep{35X9kkCIR0O(?8@Po`LNy0yL$-w ziBYJtdoOo_V*r$iCk5f~(S!w;&}{5Mv>1|2!>%pmby!phH;7qwsQTACT~hMG>t6hF zlJ`p>M?ra^Km$xl_IcCL+N!-LCF?BrP}_X2t3ywxlRqATeX;T9@!Tiu*50c`Kuc1> ztfbn+kp!WBVX*O3fWv&<#MhAHFjHzhF)`t-L)kMSYW3`EQM1OU^?XH&W3<+xoVp#?srK9!N3i_cMUy)U&**$*LaD0uPu4aK&Xr;>_&u@_x4q z;&{BTKG($3Mzf^kPql%yO`W>l9bRbOiLu+at+gNhJy;H6Dg2u5)pd9TK$rX)H=z_V zFfhO%x#Rr!v{kNYxw&4o8!|Nj?FB=~62>l_|AAgST4x_X%P=(lsi2?SZ1%FG zOH23Q++67l{Ciz#W4**cQUKkOy@XM@hA1`-q%sZYIUDZ^?v4dt3TagT{=aA=sTfDl z*fw3GqHTB*6qD^5>f(AWJAYLW(=+-ABFAR^^TwaYeOG`h-X0ybhiyQAP|5o>w5F~n zI#E(iLPUD1ThcEUR$l1cB$!Uwe!Dkhxs;4d`T%HBln6V zN<~{sOENT0cdk<_xgoZL7-!KtM>RdrCVqRFWY4o>VkxQZGx#m;bQhc`=e^rVOr2=8 zlkp1Bk?j7nT)UBhq!>>y+;N|P$`O+yDKb1|{MP+D6nz87NKVI*|P3Pz_P#Ar#X*3MtnT{Bw`@_ zD8zu>^jkUU>4o)qd;)T&#X#vHHVk=W2n6rW@M=6m#D)(9vkgZcK<+xjya5SLa9CJa z!(yr1`}_D%4VW%zFc94(4iA_2#rrsw0;&u?B3Qz3k46|b2}6nZw+}H0|JejkV&@Bje5o~G4rN& zXBwKK2)UlHKuZqnK-XK58vWoe$eZ86FwsC3-dj*yEU{(lR_5lA-2FTzqC^@P6p4z6 z7dZ1t5yp~=%L}P}9v_@uKQNyBb~iLM4Y-0SRPkCsAB1ZGNk>OTsge#+`+$4P_Y44w z@tE`$o2E2B4d)_W8D@ER9Nn%xW%YDT1GGfq41qEs!{9m~u)LojefJzZ$OK030R-4G z3V@x^zYuwLt|v4nCiQm*ETtoWsO;i5wx#*cgbWO(WLg4th1WeioDQc(5Ny05`dzx(09C8Tn~N+Kfk=y&Z`MXP^phcD6C)vIV4y4;h{2@pIKfEOZ#jZi0z z{)U)P{o+L_APRYG?OlC`OwaxpZ@s)a5r0Nv5^obG+#+eLl<>=3=zy`eQ zfk-hRX91>`J@WD#r%s&`lawS9CjkUe8ya@{a8iIsf-0X0bI=Ww>;?ef?l|l5{WyVI z5b1aDmb(ZOVR=LgPE=9EEWB!I*^Cv?j88!@*DY*;SHKUk9_w-b!(^9Sw2t3p^ zk&%&y&tHOws1RLZ(JlvIn1mY(l^C#>Jzf=CCpmgM7Nx%{d$3XY42~B&(rl%pBjk#C zGx(OOaD$;aZfB7rYG4ud<)@OihVAhAwG0eA0PL^%R1jDgD#so`GXSl=1Rat90+DVc zAj5FtXCtexO4%1QC%e-C0dWB@IE*MOSyn3o$A#HY#mR}UIg8PyMaCVQbNa5YoMTu~ zb6C`tyNv%(Di-3e___aD#D9NF3|9YOjsKdSe}7A0-v9lt1~oF2|NGmf*8l$x|2sSX zedNE6>Hqn||NcZ#(|%OI>u4VS`=PViZEbBTIHtsruR>4lJBzs%aZYFbx_=NTk*5FNQ8XBHrtPcNvb>9H!CpLrMdj*wYPSPaS z6N!Y~NYCm!OEi%fkx|2gufP27`wjdgq=S^Cz5M(Za)ufj)TFwBft8zgI)G=r1;`0v z=10g6O3b;WAeEG1JF%t&;r+ou;!T!kkwD^-q?zPb?JQ{vyMP9QAONP2@R}5lJz~Jn zPs%Eq68hiQeA*hN<#Jx&9$Mky#@y8eSudcEykg+X6^$Qp=i>qYdj+#Mjp7mt*4O<= zqBb4o^miXHL-q6yCT-}bi z!>jN5X2s;>*fHO0X=`6SArk2JaS;I|p_1oFXeL7-!q_7!8nNwzmk)j4#0kg$W~`?n z52Vhr5ydL=N(4Z5$j%6%*XT0MMT?KMP0+0k5I$iFl|}& ziv!;w4ti8cwr;e4MTF-LrZa?PvWnYNq8nNtaG?gkk4kbCFVF$@qT6@x-inK3!GV1j zP25Rf4VU3ey_b|kM1_q$-t|PE9a($SAfGHtoWlElh$6w2Xy@>Uf{-7E 
zlPnx8hqS;sbYmNTmuq1)HO`wq?dWI+y&$WC&mov8-r|Q699R^e{wq`wwhJc+cQ77P z3Xbu@l9KC5Nu1C;-iDBV7>YbPK`$V?tcreRD}y%rm>RnP0iq6`;Rpp40Oj*0L{}j$ zO9P^~@8~U{%hYz2#YFk_;|ozF;-oG9fqN4g>}^C0goE|!`hR;c!qAm;2+f_(2Mj?P zBmo|tl)fvP;cV)puNcq&p}Y*FJW(LzTafiD;DED|G6=v(Hcr~HYDkc7w?9cw2S$AI zBoBs|2(02~`Dv)$#2l-6w;r%rqr^5&1bV9}0aT4yBp}t>w;{gOW}r+!Vi2-jt3R-2 z*%I*4jF7N!I=Za;IvG}z^FT^YIXT_=85|tUL~{A~`~Y%$EaGhiW8T5h+qnAk=g%qF z@pwYiQcKq%UNb@)U2T5>As-&AAW1ydQ17SEZ}L%4Tz6Rxe4BiP?yv}Y(SqlCA!;h< zaI~x2Ik1w*k_qs*?+Vx?9g;I9oY7|h9W5&LCE~*Zur38BvN|)c6kM!3H*xtXlbj)1 zQ&VGRW>zB|x;s_!9b<0J^4Wht{bD2MIkBbzRh$7tqzi03JMekA9yW_X1u>H!tF9a* zns`)UBJdnT^#G7(&?5XqZAV`U5GQAaHUMeJCmkJ$h#-R1Jzu>TYuSgfm5dPhxJQ>j zvGYGIz=aExB&2YX!&MSoQ=>>oVuFH5;K8_oW*~Cw2;`=RFU^JF#e>9>lA*_0O)Lk< zFgzyOb`XORNsCQ^%jEiMv=5RCD!U`RmJ@d}U#kGVlvVv*y&65K4tRWcHwEx=Jo z^F$KZm5hWRe8#%7z5~!E&QbtQtyi2RZK0v33_=$hYBgC} zSjdf@6_Y3cO(bd*J=d{5o(`=10y!bpdG}xl!4%}mTz=oLY8;5q%@z8D#<%S-sF^IN zj{osXzXt3jfLD<#qZ4_0aW~o$M4ct$@OcRCwEEA|M}Y2`;)Wg`MV5OA-o;}LGvQ_u zXksHQ&zNFpC9;9KQYxo8H3Pl}Sor6A{kC*7!Nw`<@ib)d-2PaJr%kR5_YI7{>4Fo; z1qwowhb;h--TSUCwEAUA(;u4Pn`0so%m-3VGcOJ~IXW7FfOLJTpYsN5orZ*z#j3t5 zDEj&&0SIOLuAIlfg5(9tAhh-c3pRM(n@Jd`j?EOKZ1R(yq11=7O@I+54l>DOL|@{t z4HFPT3E+%-cyVd=7_b~(G^kk7q@b!Qc~D;Uh~G_P3PN%(0XQO$nUtKFxd|r+MiUMN z-zCi7{O-_bhkL(Ckfddoa#Kb%$NuIbhGbyT<^DQUEe8Q&C`@&?s~Z%yAW4m+eC*h<(w_ig0UXwU`osyG*vj>gqeQ$V`(*(6 z9)vrl;SY$YD=`ybX^2zo!c3q3pePt?tbHCd+-PI%JD5nwj~I)y4d@cEwXk4819QEk z#4F|p(q9*_ksRuR$OueQab=i7E2A_}-6eOmxTuJX1bSRG+l&`eCGZH~tG>dMr>|U* zB}jcJZMc<`xc-N`mXsXi(|Zuq($XS0X`%XN6-IBu9)OJmWSAin+EsA1#4Ju=W`ty@ z`~A9CHI){cqcZjQW<>G`V?5)#`xMT1-csS-`YfEVsYc+nQ_ zj3yC`>0NNv)L6k}q;DMsH5-3@5pcMTAXVCJDlY6NjM306>p1uzXQbiDHT+ct6}S+1 zJNR>0vXj9!841SVNY}u8K)4x*T$q`|z>CPT5^5Wpo9jP*{GGkewxD{6jpzZu1z5;E z#wIN;DS?<=YZC=$0vN{Wbt*q$iw zk|si6emFF(qUtL4+7hHYQLw>ZV2!BzVbJ2jbcC7X2%@wI_c&gz4`(nQv>k{cwvd^6 zZ8MyMfO<)QEGHk4;Q{i*=n(KD)+q3k(N{NV3DBqrHV@))1mz?y16ssgDYunWIOHyF zMK_>=8$$YuaRlGQpYa2G#?}qOjEqEtcgC4s8h+J5zw3X`8#pd4N*g>EEgc;ue-RcqLZHpObwY$NF@X(pN1P6DdxD%i+9!uX2Bj{M zOiQ%LMKE!a>K=$YaBn7uBOJw;1R2@W6XnGQzlk7d%lo3<-&D`G_G4o~UO3Oz4p?0i z+96OPJaO%K2ZU|{;5s6Hc6bko;_)psRXxd{0679AhfZ4hg^RCpk9mMhx6Csp<1&HvF0kr@cMh|s3^CpJ;j_CcSurlfs`>wNd8@?E#P>HJ{ zK=XLu@WsL%pO_%VLpySDrz`uM@}NR2JwZ~H*bjR2Npop^qsNoNivbY`vAw^`CjAx z;-0<}1)c{>UcFL@j)~#n=idaEWB$*8Y$hG=x>@er!)$seCSeVRG+`1v@k zkz^FgYUxT~hDiwiBvDxv6?YRki~x5CqDmm9&Baw*NDj_0ya%iX7@A6@W}@3CzV_y_ zAPp%u&p*!qAq^1v-h_=u1^a{m&e6W#i;B9AxGg*MG)Ld8;&J1d>aL5w>gjo=Q;z+K zhE@jlmQ?Xiz(H~D)Z+jrQt3Q=pg8~m6N*+()zHvTPY)k&mH8wY(;_h_N{SO80v@f$ zm)E-pT@q;+VPs2!%SZ{mQ#+pfIM|rq44l0rAGzGc*|&!y^!Vzb?IK(XL zCzFgM>AZl@C4S9L^jFBcVOIU8bf0ahq<8M1+taM9%}8HIPkrufKz3qk?;(viC~m=- z5ML9idkwU#9`ki8YSS2(2t%DpD~dReO6ME zyCl0GMS*Bx)Xe5zKD9-rm)rYdDhOJRKoWiVDaH{3!ADMts{CZHr0w+C9|!}wdi6-@ z!RnFTsug@8Q5C`*m>8gmtpdvse0l?l$_k%wlpPv`Kn3R{6ri#`Dxqm_uQc`?3Rgk<&VBgCFq?YdPC}4z| zz{oOyeu44^Z{1o~7X+7QzIU}w#(hYev_#@eAsflrss;pJ53Ux(PO@NA= z@e~}PWdc0=Nahn-n!)EdvV9uIuzKy6r=XN&tYqJ$jNoGo{OH#qeF@y~4?k56!|P}< z_*1C&5jRF3J%}s;czLsn2SN9q0SWocnTU!`~HoXjz&qNY3U5E-`K~<=nt2i)#uM}=3J%_*-TDFP5lm-&>6IB_T=dS*qOMRjF7r00J~rd zQMS%Le@m#=2H6>cLPBsy<4KS!qaV#GdWb>f_z6WM>Nz|usF;iM2@}H!O0JBxhjG7tm5MXw;xbo>vP3&->44b|1vdvFN*#ObOLVT5S&8UD=JZV<8s8)CW!P;QMh*te{vFnQy&J_OgNgt&Ou03T)Nq#>F|HjQ$SLS%mk|dO@uS--xt91{x_v#Z>Qr$CDb5$(j^X&Ey>V1yh9dQHiic9 zuIgCsPY@>(iI%vuU0qYt6s1VioUiFlke71p3j=L3 zrpYE|nQdl*W&;D=9YR5fx4$>46`uF{`s-lMP3I*Lex;2qOGq1Q5%M8ponrJB?3gpU z|3ld6E^&}RFtf4g5S$6teH40l*n(~W^qTtmwMRC8noEiF7Tde=Y6&~V|1?9b{Y)lM z4ogkuJWWr(fs7Q>dCaIk1jNOQ`q*0`^2(d(ea;E@R3=)HF3?o_`S~?x{D@wcEE#%s 
z1rvFUnO}k4pgyNB`k(D++x;ANP=sK_#zLgGq4hfh*y({ZJ}`yo6*P#Cg1f9gL99>h z(}iLwh%Ta&lB_ewJ{Vz9p%2OE5-zL(p_K*5XyzB3or0w{rTc;Ozz$~$k}@ey)^mI_ z8MmrxJvFB-@ioF@>UiwJBu$)1o@haCVa<9S+D*pz5~f)6c?fZ0aFlp|@beRFtu&xz zc+Y#dl3C2q-C#VUMVSGs@Uo$S5@O;NAV5Uth-ugX41y$m*dO64OvlqF_f=5H)N9`V z*H6r@mMPP`^2cD42-tr3!5?hqU8PL7pi!zbaI!<>sN#g&$L7^nV-`wFfrmt7$J;eVs7> zJT<7I_ZkIpB=Bz>zxezbHz73oiV=)wr0!iHqcj?_vx&HB1$$c%=IntG5m_ zYu*%J5-5AmT8Du!;fXCZ5LzOlc`u@shV#wY*5>%NTE@XgmnqFk^KfeHgg z76cKS<}L9hE$mT@fkAMqZv)EXNirop9c>Z&2dr1O%#jaJB{yUBaso@FfX@;pn1ns` ztgexOr%z&(TtVl+vsp!uagoCP2L6HX!fXT>)_cBtE%IdoCe9jGiaz^~kG&G)x`zthvm8JLwx~h0LAGj+&cH*|O1RLSLXy zl}wmon2H4G9sD9W+~>4V>MAnr@_jimIq5*$0SC4lx17;rn9v+ym}BhXB`HxDxkvv) z7y5pck*^f;P8A1)P2@yLPzC)a{Vf>1H9{G5-zPea!ftO+1nF7!7pvr_E6Zd-bflVG z%hL{-6m}dztt1~317K;S>TnSR0!~h11inkJ^rLOy=iS@xIEG-V8-vycenV1BbaYn1 zr8qJ-szOF(Mor*%*ogX}ly#J($0X-f=KkELm#xVP>FbV=j`hCI-DF1=w2kzj=irnn zNyerqFfP$(yF6K)8p-}V2M`&EmAO%jcn}PVlkXOoM3^e=TwK~tOE~E3VaCCn7a0?C z6Vu@^umK`76G8<6+|2f)+F(o2SyI5&oE>EUNFPfky^5wIi?8qfIXJY_+qALLsMq}4 zwHzqq%SZ#;j&?U|8q*I;lMa$ON~er5Z%)3{SbLQ;_^e}ZjLyk(=L}H2uYqu(yeb-{ z--zTFX}9pU_K9DT+hJrNEM#3jH#s?pjQyKHbXLoxm)Kd42ymwgXSuAE z2>tQhmh)jXlyu*9e-o|pCeouFt>!{mRw6BIh1zX*78@<9Mg{+H(s4hIU;{naCw7a} zXYTj@9qqc{_31cA2gQ8@2cJ=q&?t1LAr>NLZP2Fqr2n~6vcfW6o}8sMzI$3)nwr7D zJZY{Ojp-^B+`VhyJrP5(dDC~vpuc_{jpfmcZvDV*P=kOa>O-%rV*(i?X# zJUvVb;sGgNSXz44od<9M5)7#@1Me{E4cP1M{5W@JNoa@YrEqqhBSGAoxp^^+5%VzI z0EBc=kMFu3UvB9tI$*UkXR=oWMMN|pBN+FTg~Dg=_7E8^#uVONm89YMI~J_ZF0GA_ zj-Gue7o?KqvN%VkMP*bM98>f}vyI2@sf~}aBy|{L@Mz$&h>_&SO0wRAM@KleFGU|C z!4}uJXGbwqE|_g75;}E?pXBWcDj;3xk9F71FexhGB#znwHQsx041t^0a<%7+Yd5yC zx`P@DotlbBobJBu+L{?A@9*7Kyf%HT==+^GfAv=rAqk0hSj_g?PmE-7$YLrS!N*HM=j3jkK#|x#- z{3XSbSbw;`%E-;7Mb`#tnKECLLR2_}+yVjuSr5Y?DKyL9)vYz#Bk85KcMcQaQNhN) zs=JkY1;;E%8_Hf=wg15^lKTtD2k7$;<+zi8T*!@Jnt#bpb2+38sK-qN=JJCh~s!5Mg+mKAnVm{BRS5_UfR5-Fu!K(WG@Lw}EyiGXky{o}Jwqq9#F} zBi{6S^^iTGKX@eRpH(p}IbtNg(X-qmi&)wXdXDWB@(oP~F${5Aj|@UWfl%b6M(AsGEK z_D*U>aPyvR%r-1&T8N_!IA@+3ZwU3<5JMrNrGQk?~Qu~^UZpc{}yOPKwsRNAs79rvMJFh5a3Bhav_*I65ikLkFlEm@Edy0aWl4uwnz5l~R#OG(LfZB3 z3*82Ff2p{+>OAQ;`Kr-G-1mcYH8jvufc;x_n+Qn?pAJDgMpXTCnmW3=jr2B9!lo2# zxM_|+ZVIs~l*JVgh%@SByxXBH^4K#>L6+!IzwZd5=eTv+Q?-=2PMDs`SM3FfTSDT8 zag=EOs>{O~>bJ*|gb4ip}iJ42F3-%q1pV+kj-$_L68u(Y52{Eo0*1k z?;!O2*$Vp$)s>e=XufyDSk)n#LvUNI(hjyD=FY3k@m zPC+^4DUd0Ej>@%8hKwIl&dBx&JfcXg8-gzs)N~lK+j7?lYugW?1Co3eU{LWd{B-;k zh-YNGuCa~zt%rjrR#tt|e(G`-dbCTGRY+Y2OqoVs26Ev)56|2Fav%n$5TMlf~;%b3NWr zxUZ?!0>RR`bM#O1gvB@hy#Lylu}5BE_xaqecbHp#{)miL#+bqH2MBKlFV@d0a>LP=*>qDA> zT+-k66uZTB#ZX{D-n#nwJ0S0xv92Qme1JNWSSjv3IEQ@(O#^{1yb8x=QW-BCsB zV+ySMW7>vLhM=W0t!6pb_ms%7rX<}ZJ@=G(9M3><=;>F0KL1}P|DDiKB3@%ip>lLE zyZ*cL`?BCCAC^BWMMghNZ4+4h#l2a7Qu;g;*N%>kWlUY(Syp$%nszt;;Rp_@4(g+L z&^5qxAVe()qWY@iFI1X`l~Ik9Z4G0br)DhHb2B#qsn^OdX=wcXQt9M~jNdQAmiD&D z$F^^KB1~n2u8{h^?{A-OAyc*opqmbsjjo~#fjV-)b||uZRH_?z{HJG04HROOX4ULC zBUJ=rQo$(z>QyP2vvyy9`-rdTp!kb(4*f%)J%%IF${0B*aH8Oj1j+b>Lc)y=%*HI+ z%!*84ygQ0;uH=d;krJfv`#rt;C1%mvk3NivO&EpOKfi5gZLay76gF7oun9BR@%fuj=iC4pN|EDK zut(;h;U^0tx;pw0HF5|Ypd|^q?GE*LnWWdtMcvcfF;dlkUu~fc{oa08!LoRkJ-TfI z%8YZ{wJo6}h1AYK5O2@VM}iod7QH!?A2$_?udU@c38ZP#)z8M8zU3+W<8WZZK!vx) zww6ZePZb?X3KiAO#Sc=PoSbD9G7ie0hd{7iE05omZo>I=HGtxrTBCfP*nW~5NsK+{ zzTru8T2Yi+bO+j4bd>01eGcfU!Hm#^!HLwn;*}_yZWvAzwAbvHJDB7V*g1|cL`Fy7 zSYMY|dhc%3QKF*BEzdSTy`*@f{HE;Fyt8bJOR+IABT!mF0^wiRv27PI?)tE9 z9kz2AfPj$mvQL@AP-ThOmzWW3M8DLTsP>70$_?%ss|8M=Y@zqxnFui75&Ve;8+8UT zk@Y>&O%9`FUCv!q@qH438)E_*5s7P-XmC7h$%oAw#lyx-<9qGtf-VuMQhrPCPcPcY zpl}1GPJ=$ogm(+Mdy?v!*$t8p zb7N?Q>!BjkosSDN0-~b9ckj{*A9yVuFe`M~;g#$6<)8Vc$9BbUd)v0L&++u6VqQm_ 
z>QUOI1Ny&EAJMP()^F~DY7zCkGxxI zyhnh2p)-WGcALW<)xAv`@|WbStpuB2%l>p9j&N{yzi>m3&o%mEwwWNV5TLoK(nVKy zOwsz^OGscPm!aSuK6W+;pJSvf$01in>*Vw#e46%XwLe`w|OafzRh(JzQ40mX;PZkP+bE1(qBNn7!b5S@28w-tbkSC-Wr>up3wsW`dGq z*OzKJ+R09iGiD)iYkZT&i4_kL<41H>J{P_y@lE(7Faswhex;Y^NJ870m~Mg@(RV%g z@dFY=*nl;utKV0OQC(9j=;n#x$8DNqBq$h0hV7P5@Y|5SudRp=)Vk6lB7^-~BoVjt zwyVp7S}eiv7}452J#0I+Ysu2d*)C|_+&`osPVsZn;`!@wb=ti4N^IZArmVgOl=;0dw~YV=Ld^j7Wl{EhzU# z`g=F0S?gZ@N&i!C+<0gDdG_tgFqyN3?S?I!kKyZ{4)e|0H8%ui&4+J2OzL)#gsQF3 zzx(`P!0~yP9ltc__N;o<7p$zV*+g`ljdc>y;if2~8>m9?Ah^tSuz2}rsKCUY25}|` z9wA&OV@*UUg=r#<eczAlY&@`QU5fzi9fAJ#yvG=l| zeehqx>qn9uem}o3{~;u?;!=ulzrl^ZE#=cSZtp9uAE}>ja(iypaQqYbuh=glIv#!x zFftFBF*m+|u#LE`3}aVngAgHML~O2^9$yr>9(7G@VXPHHWKiyFdCaeeT>D=C0Tv+m zsq*4*rTXKe?ZboqpAS9%HaIDwXcFJh&;Z6w(bhP6*Sj#{TbZ3-9?m&;`O_6&_JOi5 zniUtO-K}5vZcs!NdHgiLZ))<%VrXzf#X8XSCZ(-}^}Bp|Sy_#xS-F-spM2k!`*KxY zoNxa8IWDonyVbDeaQBm=q4u3`L^c*!_ze76OUefeg2KrIBJo}JbF8qkKIxu{#&r!x z7C}wNu52`bU1b%s3j)}bM(P}4W3{|jXj8K3>b95Oy}1w$Q&O&fI*Hti%U`OOl{ALL z%dDs;U%&Ty_U#?P=bBaI+U^X!3ASP!K|$y1_Uj+R8dqCP-(*1E@4t2X^_O+UjgQ40 z-+t@=qyJdu3L6v>E;lL1zMakb{%!xASFNU}2g-eI|JGJNJzn~-@a<^l8%60!Z^f0% zN3L8vB2RlLma8&V{{H0{oyB{p0z5!F&57m3cF5T%q=J{YPXNA90qE=mhfmw_M(*RO z=JUO3K-FsTGP!M-?fUo2X3YC1BjEcsJxg(Dspib=KsK!sftdI zJj=GWhDfOGg=DDgL)~jA35jees8Lc(vfl{ak=QOH%yZygY@MpSZ$gjp5u&=Sy#KK+!8ka2;C5vn&+jtn#IdDJbm|~wfFSfyVW22wSVbyONl%( z=REh)St90?A$uQ5N%ky6un0&u3*nuiTUMAa zQ?^98@6@Guxak$?nbiTDSl+eHrNs4w`(&lIh>CW)a6LVEy0fe1V;-pEba?~2m|RdI zLQW-DWWI+Ua^=y{$oHc$)`3L1YGplY`|a&H7Sh)&Y<;NDpPzr1Z?E({NzKHp&}=vm zd8Xu-NHg@i<+G5;69Z14AbD_AjW)Q*dpB`0NGJF+Ga`yCXjj>-O6NTz!sfNt8gw&E ztUY^$U<8>m<DXc~itQt2|hQk0J5@f&#P3eNkgNb&}{M-%rp zWgh)tC&NK6QBtgJ*vYN1=itV9>O16zlXfN<1REr7SY-bGEt|A6&pgy17qNhG-#?t} zmv8y8d5h)B9&a!%AWi_!XCV*HrVOcKQ}^HvOKQ z{$i8ZBqgHoJC%4-)3@;NyRp|c)yE}lCLnZVRVlhH36Xmt;+mUbtWg)gzufmi6gY*T)m@2)J5^6wZ-YTSx5GUvuV65Gp0a*xWVMz zZ#wcK@01p+&a%o~ny)|i!uD*R{IHZ=?jcuWnk!^4Mk{!wShHpVUbgF=>B+h`jhTJ! 
zCr_PR9S8Rdwq`f#?=)rkKD|gb)jHY=Q|tG>K-ws_hcf{$^i}L}^!*o|k8LJRh?aJ=T;-Tl30JEV@~)i_R;ai%(l_+N-E}_k%;)FcAum|G*Y`>+;lsZD9@H>P6eNyFBcB)bJ&@>w|37y%G3mzJGrdQd?%K-#Uva#Z}&FnV~)S_b#jU_uN8(l z4n@kK@o2MkKY!h4Zd&g8U*wnfvP++($baA`8T~})Z==X?&f8-~83!4ht@W8iNz3}h zsNJYN%e2U0d(h$ZL_hZAI}uzBCw^I3>Ei5=ZFa?OTvlx+7{iuz-E}h=Ps0wacq!wg zfglOn{9;St3)626QdhT?s$A=u(Y5*B=Jwp^+Tq=jdy7-IXm5IM*1bGOF=)NAsq&?r zd+h@ZrW$D@Zx~f|tFJ~(2DYZ2uPJzUghSDd`(kC@yKiwe1r`*X-;gX1JmsFoe|{i||fbs1^qpJxm=@LKn;aIL@#Jt5*$GSrCs=rN@F?_dQeEJae>(^Fh^=vb){D_DMFy@Qs zYO$A9m?zba?WEuOJ#KR>Mag>=nqtv#HJ#gnDJ^9jFx<{H0~JEKd!kpBsmJd8`MG4} zi~N&#{9lJ$xzAVqIjJp}vvZ+;Bu#$3!AtFsTRn@de|^J(!Y23NuD3=J{@L!dZOm`LykSZ%Uq7jLll1H&ifDU_PhTpOi$&S>vkKtu-o6o z&<)()T+p#;{`JI{#ERINrr&cN zU}3>)N&kAMuJ&AWOQaT!@xBHYc}UK2eG7fb`><4jEJ>H9yKEkR0#>p%J}LR*U07FM z0Qs0%{4I`YbL5kMYiZro^9;+cA3yd!Grz4?$N_csNZDFUfn`WE!_ilL*ETto#N%?z zhXxh}qlXzsdUAynw&XXyJ@Ti>DlcYcQ-)REFk8F9W^{lI>d`(7THzY1bb(4CcCxr& zZZ2_7XtZRzikYAxsXoZ?p~0jmRY;t9?#Z5(liQPAFSr&Y7FoWIae0jXCA#Q%EwP(; zS}hoHw=qXrkx5kuUX8|B_yOsM(GOoe{&;cwCB+)ERtuVnjRq~R9jR?AJ057Kyu4gZ zH}q?&SC!W*-0FqP-B0J5opR-0Se!Y;w!=tOk&`E1Z_7DU|NN_;ocIqO{9Q5U=48I} z=Hq9)^gU1Ik5)02ef~b9!#( zO!9FZCKjT!|J|SejP}cWH%F&aF~8~Gf4fM}Dce;3K*q%6otCyfaRxpAHlS~BA^{3$ ze4O>uvevrkuX4-3^VJcn!(&wP%fHg(e~#Dhn|f!#+cjyw+S$G_mT4QHn%iqH5g$4m z+&H?HufW38?91ifv@W`H#9f9}>M}`GabvH3;vqrB)pt`yw&m$wuwAH_$x5?4a_&i1 zq`tn}{it%a>!735MmeXbq3F{q=4{d~bq5UH4w1W1{QpGcMG>Dq{>4{X(raSRYztcBfYSh$)y1ayyf+Vz%FUi{Z2L{nYN>Q#B4f zm!HoMs-DY_89b5uTSoS5ACvsAfs)q&KTbLgDv{ZKE~XQyI@2yyadFC0pUXGrR``P?N$vC&y`4F2~mXVRW498s5Q7Zmk;n)+;tmsarlsXS2 zD_+{VtL3&FHRDdJ+@9ZERmYevkz4MnFv1WknNL=Fli13}H+E=I=^Q<%_|4uYTdKlw zYMcD}ikIcjrw=+}xAx!>}>ob?;JncWp!EPLn+wM=)UPV?CwPKq5KP9TM`$%fIfZ#cHTo5(T~ zthhMW;$^6GMDR|DT}?6E>DPLimIc2q(#bT$3j=Mfq|U==oU)gO{P>G+ z_J1`uEJiusw6*a~H|(e9l5JWR3OM=u%_jtX-%s*i^mx7{f4Kbq-u^#sr_&q;id;n_ zI{$79XP30YAvGxLC$KovTmHpvX|^l8zS&?q^96^{tzSL|KBX>B4Y|U|zFw$U=d!5~+Gnez&9oy`Cui}}kBK8j!=D{~1gwo5|5cG8muGb-Q-5aX+~o2% z%TA&fNHeSsYIVH0yd6cWe&&2tvmBS74z>u}-+*UsU17$1f;~3UeFvKpUeWJ9tfHmW z!E5o*er)vEi<3<6Wo73?`a-l@XO-MN`)Km6VW&Sn$@0q(6b2R;p{O{7bq|Y^(LsU8;bCr< zFOOfoe)EQ>*?I4tJyEKfcL`XC{;xN4)cYGSrt?LW5Bx5~W2e6Uh?vyYhZG;YAVp2h z7diO(RkgO4penwddbV{skbT*(A}+w+*Lv~Ga(KT&fTr$U{!vP#{;Np>f-Ebtf0UdjF0XVTr?H`P!CF8r7|zwl*{ zWtY-9vwKnUSP_|*I#V;gJr48`QCj~+Ymk}i^AzY^L9MclO>4zXmJ4|`zQ42{`MB-0 z%8MQ2H}E(#O$m5*e0-|pgqBu=qlAOlR}UGS;qO_!`rU2f4<_=3v)h=8{r*)EpZUml z^;TdOUldKLmA{eazp9iv7=^24U27+G^UO{8c^~3{|5Y*Z#MLGM9S(7J;*r>krxv{f0U&M2C%4&6OZFpj;)nQww-SqfE&(^id zb>FmG_qK8l*0!vdmz$5iX-@0+HI@^8*3)~2T{27!r(LPlr{%a$n_=(1Q$J!U(sBHD zuBrBBHOcYt+>a--zg{@}ZqzW^cOs{ad%r9dR@->6s`~rF1w)PSL_H_-=NySQvJ{7? zcZ)u7?k$$pTsm9+eD(m(y+<$T##B^Bk5+ChR+Y=)p7{%gpA{XMZ4azOemJQ<9U$g0 zt?E~6pW*v8Ymk+cK+?+T!cw?8_=h6t^NbE}~u*>(HE48X5 z--N2+amodV(JhylQzxs3Lc33=hsC{jaY+6U5S(?Y^e>Osy#F#3$ArG&dM}b?;3x zxRl;f_?`d9KBTsm%hqpgH(TD50m0y*A?jP(I2+JB33RPhRntBFv*`Ou#Wkhh*LtDs zl-O6y7x4{Ae?x=VBrmT~fA;E~iA}`5dZ72j)Ao|~+(_G-HyUYOSwV6cS7<%FJv-yF z?jFXsk9z-lfhn`#;~OjCaz{s}oxabPd=C5bB}5J-u!%cXDui3=OzEKQXfvX@LRJF zIjZQW*(BZSdRmubbLvdad{wBcxTl=KrWDBZZemE-q*Kzdb4YdTRbChNI*EnS_a%6V54wL=KAJCsHZjH<2H<&2? 
z+;vF@Lt4xt*>_4KiaNLF-*M(FOy07+^ZLSX*}2|pKHhss zmsgcaJ~cl=74^J4?}Rh%48yT`iDr(7+?`(q7_jXsHMUOVGpZKIe8v;>YL2`*`=La1 zW8jyi;L*$0Qc*y>EeKyUxbGsVVLHmbJNqWiF*{lF6Qb zm2!J^oXn-T{MV?An+=P;^q+d>+C6=1)44ZZhVC;#_r6}Rg7)p}Kn-2&;0Y%MiNuhx zW(kLBH85-k7S`nDFIZIm*mol@A6W>pT62=$K_BP!^HQ~^?mimL4KfYe?z7jZedTaGh8~V8qmMK)Se-B@HdP1k; z#v6MTACos2?^>jN|_SP?fwX|pbrEhUf$7~Q}TVo#Tvdws!?xu8~;PrK!eXEl8m zdUAqQ3f65XXy`~CKg`UYxLhUySQr3$GoL%Fy+hK+#a8><1dilSW4g%sKL>%BGRi^o zN!%T{fK2uX2nfI&?1oVpX~|d@a$NB5&i3;9}1;!eR0Qb|VH9;@=?Yat+I{#S@Yv7u`NFihW=2LWjCnqxd!}L!p zQzaOQl$0$kiOVfnT+CNXPEZ$%Kc{Bvwe1LCjtqF*mdS{T1b)_!H2<2|!r^sxye<~r zcE{Wv0o^7b({QuXKr#0Oifrg_)<*nw&NHSJvt*vwM)~?;yv$7fZN(Ik{p0Z*sbRLL zibFk0Di;0MBtdycNp!75En0dyqL_Z*8V**{y}yFC-T{UxqhS~lNY4lS^pQ`slax0s zK^Yeq6x7AJX#rh3^!95n=W<^^270&RpJO0M1Ez$MV7eKMu5?Fo-)?|OkHfxj3Ou>D zH5Qalr&a1|v(+vx$f`S4Xlh2b6DBpV$Xu^c%HD}hp4vyUT*s7=J-MgB6%D3vFO!7j zVPYRloVK=nZkjxaLrs${Aw)t90nYUZe1!G`0Qv^YlznM57h|6qH#JXCO>j&J_lxuvKWZuUj)mmms=hDy$wH727GF6-?{~5+0MxZ(eD1?#{~*cVm@bf ziirxAetyt*6;4oWwZpS7H+^S$@Ym9}VTj-#G#o+q*QvAh1$BxW9GSUOcL%vRS*w~rKnQuf zYagRlK@|ozS)V|x9aLwBH_i4|G5_}8ZR4l7uW~lT!hrmERj;M#Xl=Kt4%I%gqUPk} z%9<{HSm)|WjExRU@-`+WQ*rLFQR}S5-_!O7{D%X6HWtjtgIDoz|{{|uqoyrsWCT!5La!o*FAY;uHa&6Zk{6E?>81gIN% zR2x{3<@w&LM=FsTv)lP2fd1U45GaK+{m0-c(n|m)Igyg=z}7E=tF-+5vET;41yin& zO!RxeoUr8cDUFufX_fBNrR4nwy|2p`a+G~-;R?!@Qo;nfTE8dWVfZ5kHgQ8Uj!4t* zqHyRUv{_LEtxt#@o(KSL_3>Ru^Xvl_mWbY^T9*!E6dogXt8rQwR`w4HN(1mbsw+G! z45bj;LaKyt?$5#>s0)n{AvwcFX9smLPx90?Y&_gaub}hCMi&~)oNV7dWsl_+ZM zInFXnKdZJpF6I@ZCHFoIxkDT}urs>$$n7lbLOP$>rQ15-tv2N&nY!5T ztH1NdEz$Nmx(XV#!|U4P-32gE6iLg%R7%fsuP-6K$xW)ZN1(yw7tvX-f8HZR&TH?d z*RHU8q89=ah|-cU_is#jZ??>E@+S&Fa34;eVyz4%PZ2!T?=G2JRXPfApLRZcgys{x zco4&7?{p6Kbwn@-`vOL9qF^T#5?Xf zi>f?jLt#5RJu7xx9O!f%yI-z1s`c-Bg-c&9;aQT_)2NBBNER&a{O;n(F8&r<8O*;Y zfE$CRZ43kq^ne=86frtWX@{(FTZj;Lsr4D ziK4^12aZCSUpH#fekwRP9M+rbm5e!j?AhQfkX5=r$3|+;0g9wIi$6cfZrK09O*hrbH2ZLrvt8HDNr;zyqWuOBrSf zoh~nIH^&q}UlkZHe{nJ&Qc&5&kmTHiG>Cbb35F>ZlShkj+pqgdr60n50~oRE)uNrQ zuCQf&9VA@f&z(8w z5QUUnNN%0t4a>9lVgAtbEl(F#f1~C2Z!!Uls*vHYa`b9=1KGn77%FDN*#y9a2_F4T z4XXcjQMYY0j7iT&H+n|-huZG8@cuu;+S6+$Y2-ri|t*xu{f$mF#-V{UBZ0UX$M z_S?DyY{R|IKM`Fq`&5+C{9%vqw;xrw+`gT^`mzpDcd_AOM#UKm#ZgpSFlm3T0~aWA zp20X|Q76PUc!3vgA;dJ5o1H9X>!o|F{~7g7a_(t4ZzEjY`X*v4r-_8I4Vy73LF~qE zKVwQz&5_y)dg0>FR%?wNqF0jMI7^74{@RnaNo{`en&{rL=GZ@=Pq z3+M5kC}_b3shqz>f+`$Vp+07a+n4rSFE4pp(qt75n?LCDoJN5d&_>T+*cb za=iMbfSox1LW`M@E8&Xc;KlO+)t!V|| zr^E-P<`c0;rs`Z~T?sWiUHHytr)th6);Aj)pUcJAou$agbaya%p z734G~1=#Nr183HPA0;Kd3RCIhlxVU#6+BQp4X&S}Powp?@f4e{wW)Z0n;WY|p5T*= z%>?Yj>>AfuzCeJBrqX@-$?FFR{N?kP*G^NKb8aI+9l%KU!0+I-)mf)g!&Dv|FzEY1 z4IM~ozX1lJtnz;qVtxx&!-pp&F70NL!PbY4r>CRS9bH{Z6-jjxIm4VLzY$vUGWsK_ zy@i(DAx}TNn5Z!AhCuL2vk5Z(6I)8Bvzwrk`8n%$O2x>c&b`=dchxCiUi0e6X-Q?q zf)_UCmYFZWzzwU`@7x~_Xn6lD_PH;pm@|w``M03yDX|M{Bs|osLk|Hm3eT50zvgzI zLahLece=~ONNe5ImT%HG5SlDbyIOWcM~Gn@f0`Yg-kbk39c{5qe7p}W@tie36ygxK zkn$>Pg!AoW+Eeoi)I)7DE@Pb05!)@9*NkY5p@m_*jG@xj|jy4__ zcke3eL#?vG|MJrZ*o5t`4BL>ra?5Zon|)~P(J_lecPzIp&emJrDq@>u_Bze-l<^y% zx-^^L9yRxlnmRc+Ji$gsgOYBl7c4qqPiTHe5<^#;vs+@D7QCx{`SRXoI~sE2HtXUV zpCgv1r3#Ll^h=1W1iK${P@~`^M1qD@tNgJ&^dkJW2C^S6J4dhcc6ap#BTY5r zdMkwi_wcEBuDwuIw*EAkQ6`96l{!1O&HY8A*VOC}rRLw5Q!}ev{RDNv>*@N6xUkm2 zgbg}=$V7YyCQerU{c@Fd+~>BrnXjdGK1OZ-O=daAA*Xr5dR4qHM=kp7yU~wMPvY`^ zZOOXbON5#B0OU%KfD!~h#I(=3K+w^DCBY``GxIor- z=2+=l(Hr4dX**nHFCH|D8gra?_*Q+I3)1!Fe$hmAf!(L-(8i1l6}0a53_Wo3&Zq+K!Qo=1;eP++{g< zbXg?_ac5(i2x@rOOH)m6^G>sh`T2F3I%tC&))O2+{+aii7>MDIl->+AfM9PP$Cb%y>8c+rBthqy?YqhU$n18!#Q2pLt20yR|5`+i7@(DIHell{*5_?@&* z_ubqEm$g}E52KQDc8WeR%}4DaAH`KoZ@$>xVnciTu*6cmd7ZXJ*(QZy3S55@G8?cP 
zRae@eqmPAF?$P?3u9Ifj@omt(U)|P=XD_>o0gH3cc9Q`0*LEP{yuJc8b>vLP%mtiEp?WSqUG0+I+zC-`5)5m3b@n)&J zpwcNjZ$uET;Y7;Bnu{}aUW&W!4CxM-O10nE5?-n&lc^V5^F8#r^X(^E%-gp?aqOPQ z%PwuP+L5!%%T67Fa57C!e!BJK*Pu{7(K8d1+!V$?0tK6|L0sgEUeez*BU2OTYK z%cb+{*MyR(9GXhs`39W-qZkI>+>J+PkLj*C+!BNbA)!w!Y>iuRy**___5(UQ#d<9Y zS4~Zhd4slu5RiHmf139A!ZrPQSk`ZC)_T^zgdp14rF!OZ#!E`Jdl;;=v>d>6oXl)i z*=uIxET43gNcH|sfeun4F(f$HhBu+7$X%7@(SQ5T2meI7(Mu*>_mV0uuxsaHqn#Ju z3oM(!Wh3cu( zjrrX&QYGcRKfSy=W|eJv+zh1QhvqgGSPX2(HtlT*FH%PeZ7=EF=?2KXe?AFN*#k|Y z<=#Qep28D=>0<^Nl(-lbXkC)@Qy`N zKM0th9;?=PT4;053Nn*!VAYxWSv@{!*++n*u^^ksQmoAXRd;(M;pJ32)Rd-J{Z(&AUtjO?L- zM4i`8Pgz+Lb2pgpOPsz;VHPi&uiJ)h?{)rJt;XxvsHpbe9O0FEzY$xc94aZ!1aL$- zG4!VCck6JI$`|cN%h9_};j#SpdIbNv^B|tY1)E3Zh5B7^onm6n=sKP+Yv-qg#LxN!>{P`&8FRrEv6-=owe5@ zST3(#)Sax6xn5e{cC3vOCn6TqB^Nrwem|Oc*;*(3@hla58MVrP`%IZJ(9^eNG5!g3 z2Q8X7U%$*PuLl%rZbxxJ^!S}_8XZAd__R!?Ks?FHZJd7ATn5_M;x7eapRP$%MO_5t z5|bVdSxLY-0O!#dlqolxZk<=?oeh~*Q+!POCMCAOJSow7eIU#dKO-Ev@TeY&mGP>F zpc8_(@_>sg&hc7J;QyBy97RrA2{uyb6wtKT)0&u+O<9}p9R8QApUvqv~ z5SkQnym1^^R!daH`1bwml&Ek$q=03#^vlKu`N(zA%?KISo(tvYF_)VH$1~}p_@KfyFd$BT z|Gt%8DkCm+edE+7BcXb`U0?G~octXrR!6 zsP3JHLrw|W2G9Q(Ai{yoneT-vOASqWc>R^=K``N)GL9bqnAf}b*eUbTZft(z`!v}}^~lo(bWCfF&R4ERej z6lQC29v>a`gPQo&tHA)l$0ADuhWPYU>K#<|XWRxC%JpKaP= z03M_Xq znKXDQpxstAfxu=Q$1~lkJivzW&Y( znsOyTmJO;u_13BCi>=2(1g{R_F{_^1EpNHNpjXx3(Rbwre@2tsTCNhvc!}^|Ot99D zmFwB1-`e=9>9o6ByH_MYO^uQ>>?o!5{b~9mOyr3@8pn6-`;4W5%07R_8-#xF8{a;% zJL{Ph0};5Q5sXFRLvbnp{~4II^*Yp4WQey2)ZlDjW2Dgs1B18v@e= zL@}0^RO-KfcP(VK>k~K#!@(9edfDYse|NB2S>ft@w8$@QC6h5P$h-ahD z-@ux=gNW?i zZ2-9}4ScnP4>2FWPn~WT-VRBu3(%Rf>=NrjeVC#(bqgJb3Y?O0TpXvIo4=ebA8AT# z9fKncov?5s#CCyTzGK#+2Bn6nTfuMrMH^@p|5#V%`xv4Clq${fVt0Q(uT&r&uIaY! ziNCO2LADBNw`4fZK!NNF!lqxxlsdh2Xd7crp~$QrK2yBoe?osWBTteBSTpcnW%6YN z7Ofm=rT;4!{FGuik^YoWc8l+a2R93~I7c~(5GH*rk7?I#eL0#1EB)6Vp|h~hSG-1t z!vb3^ZLNb6>NZ{Lo2jhKVSG>c-E3YgYfLP!{NWoBsq z()wy`MdA0uOqAuvhgjDjfK)~5!9xUj^9HsobekZU%i1471}MmGN+VDDl$rCoeV>TaMCzK8P748YdnB5&z?_3SwLpn_UQHW5MIPWSg9&ty2OC z>P=mKm_)sSJ^~UBgP4E6`x;?MoT~k;ba-5GRJT}-@fq%G7>m9CvPKRVaKf1%++5UV zIWCwsG?}Uk zyqC&QY&*=!Z)c*tcBm~Sl_s5I1{mDK8Q)3u2VRVc47fN$H~I{?p(~e*sdoDG@neZ? 
zGd-YDuPR>h0@PJ7vFo(-P+>4z!b<;TlJyiC-*UwD?(nzm-Q$AiDk|RrU@b@cDvghi zWFeVKu%2x;bwm{u;5?3Wl?L%*_GBCnsXX)9lG9SRw$#h7VIVOf>~Tv%>cR#c&8av8 z;4*(-A@C287^Y9>Rr#gDodJs;c-&&;R9WZN9L_59Kg?%`o82dtsrl#SP6s4%GVjpA z$oOS=2e-P?_v`SC|AYi8`v2VTJ)oia0xap#)>I$}d1ROGnw-z8m5ygmSlP`l<$;v& z`)UADW!qu3`5to+eK^4_RJnmtFI=@gQeF*qn2GA_VaYMnzDGu+;(>A_kt%mtQ%|;Y7oF} zi*7G^_Zx`ky*7_%kwupOCr>Th^uRqf5}5=+zQM#}@X`WfPDoEahq zAD(}4M7P(bq48owVc1b?^y)XzUCyt?i!d%+iTk*6=m;b36nA~~>cqwW`^2H9rMVSST|Yab~)ZNdoSFXi#O`bEEb+ znmYTuH$>w9rmi$_RWAMvm%fB##&jl4T;i6}_`>?ZD&V>SX^|~8r+}OAca*%_-P@Bx z*}x;9X2N+?N!_w&islGE1!+P2=1ugucV#87S^cH$%S7wN`yj8NqB0-3E{|tT;8!_s zxI^M8g%O^pM&Of%1=X(%u2V-=e z?(U>FhISZW9PZfa-#;HFvQkgquNGEG)Nl6p09t)aVO}lI5!03D&S>~bROK$`?_%7j zQDP0%R7@}vfMDQ{kj#NhIEO$CI3@0PSSrJZkO7=}v<9yDBL;|OaN_L-Dh zhtt!EFFsI%x%)(_(3^@muUWt&Dj)%}vN*uM6R5{!*Ovvyy0aHJ!~#b4-mV>5pw|)= z#rxklPXnm30CWRKETl>WHaAkGV(+9}U`D}jV03%NPVPMjz>j<>E0d0=U2D4@m%ZEH3onuz;NtZ651xNBB%Sgyo_6iz5s-R)Ki|`` z#PX!O{2IZosbzzs_2)q&sdf95$z)ALwavr7^?GRVu>e8-c$sP&(5jn-5g$eWzrFvO z#Um8~mY>UDycLzuuV2FPzUf9JtL8gelyjp--=MnvuXv_iW))U*3nU=R`(A&SVzHq4 z#mRb~Qm%u;jhpDW_lczx6%jl}CnZLQ{Ge@ygU|mKPUp3H9RIyCScjl!7601r2OJ#O z0)Q$E5KeMIYp)*-2vE+x$o#qq^ezj+5_%Hq zAQt_2g>kn>f}6^Tq?J`(yr7NGPke^4lH&*Ud2d5nzjeZmL<}R8XGg34jXw%hJRE^9 zxL97io>Ph67%$F{D&}49A7j2aoA_8&HBvN?_8$)RCmXAf><93Q;u4xwziLt#1g=Tzb#8zI#q`z$BrHLj%>+6w`GR3i0R6Wi6O3xHgBFv&4 z>O^pbmApuzpKJ7PS1*13@XzYsexhTq%bWv~+sdBc99>U*y%*Q=Rm;?QvW4OJ4Nd-X zvE{E@>|8vj88M}fmxNvFmDo@@{H^-P;)6q`vT{cOd2X8I2IIT(Pq@7$rr7#((>=3L1ui#r2hcWoRa3Jnnh& zXhRj*7(6`rL;eC5E5!5)ae=|cQOR!8iHojMG@>W_M3q!$xW`)}Ezx0OO6|qvUAyA- z0TM1wYT%xP+Mt3kenSA96+sKivoKFK;UwP|c4$fylN)jLOgubyqVhG`W2~YquI32d zyo;V68Rm*e{2CvxflaZC)f0b?4>%z)=WjG8J2R%;cb(`j@9)wKP&5#{&LhUAuOkl1 zmSRABLiqvpQl;x<^$^OZu4oUCrpyeYG_TH|hkcL#BCWjqLoKA|O|`{cFVW}Tc)TA$ zQ0PSIVTFQGZzy^1UVNNg!)4zK+`v{G#NVQ|W*zngcT#t2XYRWh$<$5xdFy?dpGKL& zpFytA@1JTBthc7yTAORFv}=NbL|?ytGG6+*IZN-nzMA6m;Tg-DLi^U>gM%Q#+;<$r z(}YXQfsKqYH0mv)$%+~7V1C~YijKgmq?j1*M9OCN_D_$#o^3bz49h-duAvlm{BnsA zqC1hHIxUzwVmj{Q*4#u*7-)r3S(1(XR^ouKdS7?^{@nuNkL-~LGSXQC-lNkM+sSpc zWQpD{V78nrye*{3TiI7u$oEO1Hrv8-peAAnWybqU0>wEBHu{gwY*Nq079!8xyhpEI z(1b}XHGm_1`%3@KPf2 z2=IdZk)wPk z^Xqn0;0i>-G-uny6@5IXj2A2>>X9dP8^p#7Y+V_i#(z6DhS+)PPLVs#PK?cmk2$SSUUKB9fNQ3mc>4Yt{ST$%%?ID6b2R3hVa^aP{u$Vuqc0iy)!*5G1`+fSmPPQiE&FHTQ0@%2?usM76~n>y zU~N;=`ij4Dp_Nf^o9&ZImwi!hZz4r@M**?GX8bE^+U(oU^($}`ch0nqFtxU%kby>f4G5>d6LjvL%sDQ zqpfx4z2{EvB^mEpy)6A~1k`_g!m|C7wE~XEo4)necOzx&QQRSG*h@5--(52I4+pWF zGY9cr?*2nw@|W02y~k@~C7GW=%Z$u{v9USCLt9y2e+cQNkWz3xR{Vj!E{J3Jg88EF1in`dt5o6v3=%r&}iF2#IX$cnis9pzz<3Ck%2*KUd65_$$ydyN3 zhAhlSteU(bSebX9%Ll9Hd(qW6;TWr{!FX-5YKnAYDSCy}Eqk)UcDX2puH zH^B!FOSr82z0xnj7F`Y+Ey+1}aN;$7ErC%=w{E+5`9*~pyqU(l8*KXi?F6|EvvH~! 
z1y(fiC}R|FS8s27^zA{*69WV+>%KY{CuG9s>E5EjcX?HrxFSwO0@b3m4`ddv`zi{6@|+jlP_QiS+w1t#QdfP6>01zd+qjgWxVUQ zGDC=1MbGBPtt5z0I-(Nvxf!U|RQ!^H`EBw2yLSofjK2XACoeBA4OV#^?iI+bf{iXl zD2^DhsG4;4c=inPRMNozgDJ4t7biS9nVLnTG8BSB0 zOUd>6?D7g<>zjYky{anSa9L#r)MunL_YD2K7B+bjQ<}v-eC6WWC77yq^cl(1kkgaA zn?Axa^}FS--wjmZ!;KODyQij^K{)-dS|d)L#K-ape0$cwfPh>YG042&HX34tctIlR zX0Yq~JvF7-HNX2A(9&Ks1GMs~m`0uNI5voTh@U$T@i149y0I3wd zvdH6M-jrBXef^p~Ew>EPGZ>NK?6d{u7h~1u09CnZS^NYkTC>8IYuEN@Rr&CkzU{P=s1vX z?d`3%!#QH9J~+wgfkfY=7}MVA^BA5l@qCXtk=a7?Sg5=OO>`89HC_;w8TbGCo0+a9 zU_PP(UlDXcl8E2ncn`w{etrgEz0Ym4rV^vIL?!I}1FlOz<5cUOJ@`Gde;S8)QE}l5ls>__h2|Nf-yyA4s5JGn3%|cSeeDf zCRZUz7|JXO!m{p10(|8fZ2ex*?AfMflIv4{Rdfh_htV!;e^nlIv8zHZ?VxMSCB&1C z&-qvgi0I)k9p~1lw!RIiI?rJyK(_V@CSV}Xlk8}7(hHQc9r3>b|0*?_MoSQ=Zq7EN z;I=FcA9D*7ToJI~-7q0b@vsrnsIx^Qx~}p0xk3yT=0gYvL_EN|&0#a}CSK6x9uB1- zGv@hRD;Co08}db3L0SoMD?u=m^FzDd^`};C^^hT?#6f?<4>n(-kP;U0Gb`&dJU+gL zvOyZrkt@SR9Q11CZUfb`uww_x5DYXI&A($$z9i<~Jvg*Z&da;aA}?=XXecVHjCSVe zSol*@R{s;!5I|hVJ+CNoywumIzXvX-&hQS=5OT((dTVKx#KI<_a+2I|``}ufkzZVd z%pOghgUu0!4>m~vydL-FtQzVG@cNovSb)Ugp$^1I$y(cgZRUL%)IzX@eyO6Ws+!25 zhn}I3bfsFdoAep{jIk;Bkx5Q%u(Rq5|Eb{th}N9tK40lL>(zBTLW*`}Y8s!Pf4I=9 zbKE@EFsk6+vuTf;BVHi-3ydis@`rV0djq5tSl40E-uCcg`1Qpu%lnfBk7SBoO~*}!B7LK&H9j}jToArxTv3( zsGoQfTaZpmE5n0130hU&^SMguJPpTmsE_oY)_i|~LP$Y1*6DX>AuF9k z?%Qn(H6)xT+wbgfK#&o;{SNNIN&frKoc|onOX&N<)i&WAW5b>Q&Ngo%xv29G6M#=yuZ042KE zs7>cLPWfKz38mp}onKP4#igaCNVh{Bb8|M;28gc)#F7}Wvv%0D$*HLiRV!ggZGI)J zxzhejv8wzw^h6{WcPZP0J2M~L$IKR|KO))Tf|rOIB0SaKlt-cw`@{p93)|qTh=(r| zgdy#f^#1uTfB)Oyv#W@_iRWJ`D6~_#U7*0CySO}_hSL)t{Vw?}J3G6Wi_rAU%z1F- zlX$EPZ&n%%QDhKsa}{Sa!1E&Mtwdu-(Yf`E%AM7H91-u1It}qIMiS~doi_M)F=)bW z!KI_9q~t*wh4qo(qlKz!I1{!oX`z}doUzw@A}sR<%s&9RBu5#YYhned+RzAhlo;Wsca0;@E( zPGR+^BFu_28ghpeIzfd>{fo6o(2pNYgnp`OLA)>ukS9PW@S&W#+ z=sw)KD|*`NI5WiVIJrjubq-g*ZXuY@4m6@*T6$vzTa{(20U}>dle3ghDRh>Wmgd$D)p;G($XT`Th&TN@ z{@v5#0mn^gS(&1|I@WNup_sZX78BDSKbK4N0$r_#^hZ2is3=CU#6*4^_^Bz|G|9N( z8QsrM=4aS$D(m7+3LLk#NGr2L4i}rNIhde>w#*v>-a8J+A&mpPR@pBuFL$@51P->R zsUUYM)N!#Gra&B?;=G5V zal+Vgg_!V$4M2ST$Pvw+4fobhs2*M=t<%szWB|ljA!B$`;fnZE+AGAv6Y;LXOyctF zh`C~Q?)wRmNIW$^o+0Dtx&HVsmG6hQtkejy(B@F?;EG`7o z?JxH-fSYl)dIc*0dhW2MK7NImYcI(VN_mZ&Xy&l^3+^?4htVUxK55I*f4PCG|4gxf9&9Z$^SRLR-1< z%v<#-9PlTuMFlVUHGa1SOG0_D$4ALiX$T*li(xqzEU(L&O46Kmmtd+xKx_zgAtNf0Mzi ziNbpl+I$AjwD*pWGpPt5%BQ%n@CO9DfsH~27=0?k-GV&e#c7n1ab*#OBsV{8M|ZvT^abaX7$ZN&r#kTbBH--g+6wuXcNj|Xen zAv`XSYcZoP^3fzzwBN#0h008)Y9?P3d^`;z;rreeyhrQXX1lPm@(kdp2&N>-AmZoH z0!E!7J}>lo8p4$E!Nb7{^JzBQ zFfcY7+?x6YeT%=nAm#40xx~}k?$hm`-wWJ>9xF9qfTTek8Y~E?OaZ_kMhhESfxmOzH)lN4;8AKknqP1QWiuHh#|n*2OoUfg^WXMvYTElP?$O&u zjVOJ4_>rAB$Xr1lL39HZWf;gxR;?Pwec4%v;rw7=AZ4}Gl?PUmxCy=(k(gYa9X}s3yX_-O)m!aWd5ut~X7&Dxha-(EL+1+q1Mqm*&kP{F@ zXb2}Mz)6$|iNG;hw?GsC4E=*_<;=SvS&cexV55Of z-0Axa`CG>O-=N-$iH#kAl+zX9De3ZqGSkz;f`fzQF6CT|AODecQXW!c-HVY1U(n0< zt!hx27(n5&h#40dDGA>sgH^K{H>CX`;0|DzaO2sRFA=iaA>oDpRlg7NP{grrpc;)8 z)73hkzFN9_30AYn_7h^LLmeecXh9#@F_J<3(bTh$J8pg*SS9fY9=-b*jm&YK>&JbG5Y3FCS`1oS5F=!yhGg~PYT|S1>8+@nL zH)3IAJS;2Aki(LYghUz)M#FF2_>valY1$Uz&523A-GjQrP^Td+jWUDw%=4pax*m?s zV|SJB7G?l3g+h7ZU~eB zxj>c%OjD8UKlCiLVN&9;v9SQMXfU>zCF@AYCGnuasTfK%j1s@Ff0~w=sm3~33kG!D zwi|D<-eNhPS3vW;DF# zB|7{1KC#pL3%h=FcG8B`kh!(ai2EMyrQAGH%3_}4H!?m_KfjK|t`c>#Oqux%cE0q(pIY6E&_kgDC`-eDSqlo=+w7B}Mh;#em8A z9>{eDa%wP|L+>Y~CZ)m7gM!!#6)PeNVUODPq1zKmz&|2`8|k^Z@vZZ_NR;4vSeO!P z(TDf%pTmr9Iy85f>LVpZhT=9XEKIaIaA9&{F`5_b4&|G6nlsgGO;$^xpE=Ur$6SH0 z$GJEa%VUg8(N?u>uz({imfwbn0!A>cZEPMn9mX8xWv(Bs)c-kMa-~Nd91QW^?A8VWJuxAYlAGvlh#RwXP zdRq$v0x)+^oxy+m-s$P7((Bi+tIofioSbl8Iz@;pf6CM)#HlcSCy^={d@UA3* 
zqJPC^kKxQkwyt{z2goG?II_1oz7^sIVg*B=Ck!4(S_X;xvS@N>d_ux1czGe{lrU!tfZ5xnc7=ldnk*gpx2_0FApw{N1ZO*ST!pXX=zG1$pWk!}o>-iGI8{xW<+l5C#p}mABdp|;rZVf%22feD+;CrEZ#RJgcVj8cCkDzcuKEP6RziR`2++y zzPpL}U{i~T)H4;)Nz&rE*brDlm(FcCK$|s>g^dlb@q12gu3$9J^fsO`qFvF#HO)Yg zZDwCuQ$84s$fZ$U9Ca+Xv2AM74(Aw4Lx`{M+oL}>u@IR%V{hTX-hoB0?ESh*rm3St zTEBu${J-$^jnOKJL!ITwGH&OSUXh@kSOGg$9>}|H5|Cq(fdLl^5!ZNr+l`Ydv1L01 zA)-%D9+(z(1RX*TV#Wp>1Sw%ZmTPc*f~8sdp#u*J^6)==8O%9!P#OII3XzbnS7U)o zcDCv9qbd^Zwl54S6&HQS~E;i{~HRkJ?2*e4F;-3feUf6eD5 zujM+UuIX(k-@SSBy;C%u<_+M2YHAh8j|PB-vz>&9=mnfs4uE7O=1Qv>g3wIdWBW%+ z#C^(;7v1p`NF@BTadV)HewZSopYB^NGaommuim26z%na={7m%B8!mr1os_ERuXOeF zI{N#=Qc_Z&HH~z%UKdw?lAo63N_vn)s09%VaLkGW-v7#vIFbRF>da}^^I$-~F)Qd~ z5b0Ysrj&MU~Se*mG|g~r?;IEfu^};U3OL-&EcL=)EnpJsxT%4@4a1!LW<+RB1^72N*)cVdai~pi!wPq!cs;0f$xKk|E!g=c6ePPE6Pe!v9><_YWy zLd1v)n0)%ga3-+hT+}%1IWhV(!~fpl@rv~9XFP%L5zp$)r=K0Jy?Z5dftA%v4lKs% z+FEFD{Oad2zvzhjVj8R!e;pz@QU#^8wUg`x6ICYU0BZqs*->?AF)k@d0p5=G$@%R& zcj%m)&M{jl?%x*&1_0}s&`4Jfb{it!jj0+4nc<3jS+_(XdS!Lq8aE7^)uFGYrQEBT z9xf|TQ5b-B<%5xt9DZByMat%7oeP@3c+-v7H#j*eDxWP40=7UfLIpyL(6TZfxHU>9 zC$@5PbL$G*oKE*XLWgzD_rH<|9uFukip$G05(MlbJMk=4XF!8Gl&>iO{|L@uEKP|n zRK;K(F<4pJ$tdpU@LG@tKXzM_kL)t5KJ`)}L4?*oBDFsO0h!C}u%})8(TXEF36Uog zOZ@Wxec`~LNOdW!r0Uf=6Fuk$>P<2+6SlMEXiKcV`TnCTy->YieDHU9USP6my? zNl=SaQ+=siDk@M9Mj0SzV4K~T$`65c#a{xcw#(DKLURJgj=5pFymp*noA8pf!2^D4 z%l3f9iP^X>_OB~aNtBd9#m=661&*5(-*Es&0X;V5+T2OhPGM4g zds5bd;RJ6Ow(4&TRX`|YjG(F}Pao|s--}F&n7vG>9G-Fbf{FJitp}xDn`0FA-2+Ak zY*<@wxPZ7~{Z4vtd^qhJ%T5hd+89cSlYdZYMw#D)p@&s$aQYYP+d@@@2%?P?yb>I|;KEV@(NKW4H3*y1^v&IcHzQZd%UD z2UN9gWZT%u-HmseFLnpD5oo5pD9S!wrk-CNJf6VEFxmAr0=LVRqty^yC8&k=BBT3a z)wKtAHfy(Cwa?c*EMn<_=h;kzc>RP9kiRHBF+7dBB6hlnq@DgZF>x2!oh?`hh>8Na ziOBei^5&x9@@EShw<&EaMRpp}uy+9K=*T%5F8TO#dCCSN+!^qiq%#Z>^0WcS8g*dd$%Pj zyHn<@v{<|F>~QIYi#F05&!l!mul}(kWifYEO@%}fW%cx!nzHRhSLgm^(3AxYGDqiL z*SL4zQr_OefL-ynnOZM zGZ(7wkKmE2A6V%}JlLI8YaT2oEe@Ux~;)q8y zCMhXN8DknRY_$OEMZC-HM5cNaCCLFYx3WBUrD0;Pj3F-fj`lC>i8D-ftlLJiE9SU( zxt|*r6VIPIu#aoQS<|DvsmP#6v-3(M=S!9gC-zOTzP|MKr+1ttGvk}kOY8wZ$BcnW zkDw~Go??N|>Urfie5{8LA0C4T7*2FSdGLE|=K8ea&DCZH!y=+0CPNCPeRQTG9d(R= zKZ7A$jeq`p1pU#7^06N({wUJl0o+BX8cgA!qvC_rv`ByyfZ*z(Ga}fx+*l1J0ah3*B%Kwtt{x zH?CWGKiq0sX5YJ$77n>PpFeC+8v)38#$vusv%Ae3jE`2*M^#{ou>i#D@mvXBfwcU3 zS&K>E#@ADf$LizEL$5F30}=;vJ&_bNfRgok!6hmAo{yh@P+%JAOa(@0$6OP=l3>l3{p^6Ekf>-pE>Scv_*3rn zmE0Ni?Hy15^%5Rtd4>*xFYT`F2hX`kGD|wR0cc|7ib0U$z~o?+ufFfzjS)&YZt=S<(GN-X>%5otw9x2)=r7Lq%DG zHnZ?VS)nQw7xLMolMW?^Itp35UlpfSt{?fa?N`a22Fik$KAw;g>V3WM@!FI#%Azx` zXU03;budxulR^CDgqhZ}XZt!<=I{-PUd6@EYYZr9aHD9YTE7bm>mRHRA-I7<``cR8 z!Xf~_7Tm$Xs2g(NX6wB6qm)B_=8PawXZLnu&3lmp#LbrWkBNzWhUS2R&bmz7*KFB| zeLHR_bF9a*wz9ILdh8qH7ru42{ku-$$%M9ny4{QVmfSPuug5J32?@+EH0dk%!i=O6 zXkq0$jJoq#C@0Yu@5z@}ykdz!gTML0qSE18~dtg!MMbZe*w|;lGx||i zpA;5`>4|Vdx`{l1nSNl34x8&@_y#l6z4GjwoY5fcF?d@Bv*MV~x$Jo`kyzZR7ynMA zt5|LOfvS17yHj0CAJ+;VANGW|O5gmrq*Gi6BF==4?sjUvkRH z%aha5(Fux(=wc{|Ouqh!&inmGC3c!G4?5SwIG@8R`*od4Q+BH7>;8|QV3cjlW__bV zb5GJY!O+l9e0i>gzSvmy_u_Vx5LBgp*^XSX4+Ao-Y;DhjbR$Fo;(^!=R;@s(%YOXW z31F@Se-C2`GIyD2RgVf}Nj+0j{`e&%W{-@F%}Bgbc<=G%!X=HbGa);#X1pXZVj6M2|~)4zWWcRF=@ z;G{?A9Y_ifj*1 zlN}=`VxB!~KgB?YbE^C=lvkxJ89k z|JL4q81z6X5}e+#F}+(~SIFElGf(3up5mOL$}|RbgkgcwSm_s#8+vqxyhukTjS;aQ zFi|q+@!aiDyEK5wX6;{7c>I5u(%vOCFKGFB5Q2b9oP@2WwL)X^wbRd@KmS%+yBF8N zS-4jQyO^0LbQ`+660)FeH@D;ykV=6A^mj5WLMS=$JEC_PlJU8lw>>?W9X zt+(GlF=(BuzJOg+YGrNB!}JAse`#6SHn`?lD=S-A2;g+4+m5P#vFv6R7Z*P$<#@)h z*ySZy1~x?7ozzEUpgj2yt)3++aM#@26p6XR>O40@($BD9r8PBsj$iaG0jDm6|wTMzx*Dcm2%HrL8>Qzb$E3ySwNYRohFP7Ueot$1i)Gj%_^o;5+&J zSi>PeB-I5XLSyuLb|W6$Z_-LoG5xeF22b(fjYphj%OA_XyL#&)Sh=|q{&t^+dU18Z 
zya{)U(wpF~Uqt|-y?Xoh)0G(t0y8u7eoBw0x=T@3e3ZqPE|E)wJqOacA?5hZrjo$n z?&cay&*-M6T?pO_`&x6X0@J&n#mCb$llJ?K*i&}KJrKSo1k$M6g zt_K90diQY%)zu;-OObtK7?+aIPqZ`s`SYj#&!6WQ8i8@2q*k%>3JU!|g>*r_K^DBA zTK87#8JOi>9COl=*VNWF#H+E8HZf`mq#(&xI^xwLFXC8ZuX~r2$RI&c{y2@7_!mM; zFaVbcKa*F4c(y$n^0~_3L$D%I{}3YOUbu%b(-4P7Au6G(K^7vSL|aZMLQ1}>CS}|z zQ|h34>^8@kcB6yclHKLfH??hReJv|p>a`K%C1@qcAx`Yr%yrGIF+Y7evp%!?WX;=C zv`{E;>9kecIAwVLeALH;P}}e)2;FUMy|^(D+SD4lsb1<<-l)oGtgiZd5aY96`1r^# zjCr?H4E6Or0V5aV?`$SL&PjK%!LSvs6-{Rn-bqM_CL4O69u_!s>*h^rvh=B)f|=2U zhF}*kk(_6tZ*6JHw5KGoaKvi(7qZ!o9Nq38gVq^1#PEhD184p12?Z4tdYE5Lh4_q% zl@U-p24};wmj^=p*KgmxzzX&uerx1%)5h@wj{qbqgJZr>2UxA{5S3 z>1J17Idgx|=PJ`LLLSs%H4#$Rhf*wVk#Z4I5>L7@u`PKvdcptJ$jK=x?p8hWulMvF zCg#pUziLLm-qyKclGYpwmpG@_-BcBC_VOO136}B^BmTR2=5t2HKviJ%#`|IAr%+uV zBPgDi)x>s_wTof8!d>dv+hW%Amu$F>vD@+BfdgbVHgh}%CSpgZ)z#IZruy*l#8XP9yRW2c!O+BuGS?m86)Ex~kL<;{!ubxKH3oGB zJKLwJm$N7^>*^_W__k`o|S7PjWVvGyPM@@ z*x72j@863~{oylQ?(`BwkJ-{xH?M8A2`f^nLG<+`F&^kNG4SZC2noGb%M*SDI^|xm zd(qSc1HzPRj}R}75puCOi(smK`xH>r4N7O|TvehmLaZ0mxN=rPg7MIx@g)W1rx|7q zY{;mO9Xm!qgeAS$Q}_R#L?)2;);$dqWPMrAG2{GTQ9M!Zb~HBNI%-NYGTK00l<8%5 zfcMaIxf9298BP|-nG;ed$jOLAVv*HfUNyk<4Ikups)P!~H8Kl`i!*V4zi1t*sH3fY z(e(OZ&EEl$Tc1hSRj4#03Q`R4aLnxoku}|3%#zYpJ=D~e{6-8Zy(iXwN{Q%&nRRqV z^^J{VKl3O(iXv+$w=}vF{k~aVIuD0iexn<~LB?iLf@H?)PeZQ1UgoLpECUV@yLkBE zp+jEaj<_UeIId9_*bGzCYEfRiaDf7oN=tDP575zVhfeg)ojdyZkM_#AOOOLqMOICT z$qv@^h8{%yjdy#$u2$eGnR)qE`>ewj&JVpyCaISYbq^-usl-X@-@M6wFEsQghX7ub zfz!UI>mk(E%H7G2m3#WeoxJW&tTcJ;7OK_)jY?$W9XdH}>SkXP>6 zyB8=2mB(3uHzz!Jjl3nNv|b>WK#gIZnJV|N-^}2YEM^ik{NH z^xoPQ651C3Bp06&XUOOCdl98*FDf4IG~37r6288^&wxAQmtXTqW=B$&0Sym^P?sQ^ zz+8*BV1RHTpTsQVPe0Q=3>i8)IKe} zU3e>>d6{=vz@m#%K<9`nMR$c~dF)Ms>sHCFj~ZuFuV`pI_3*fmz!(pgOg-VzuP%{i zACe8DuA7pu$R_3E zFzBhi`ECgbyA@?_iU=af1!bBT2Ru5Her7bmOfrg!3EK}lV;Z|9QOd3l}42&?YZ%JI`aiHY|@ zzbRe2b}g~$E9Qw3KGCr}jZ7nYJbM;2n&kW8AOoK+?2gWL6g;NAJxKJgQp&dIguc#q zy6UmU4m|7g=&NKWj}Wjaes+kkS>mLogU5qthr%Z<BJ1So-81{1$E$S#s?&bLy{ zps;-nW>LardNy$}A)QB=2&_MWGw|~Jv+e*=oC9|{A}vWyNx7z+d=tUP6Ody6>cV*N zpGd|(RfxI|LH)i-S0`!BKk?;lJ4v@|yY<322S0AQ=U|KdSk9#y&u|o) z@>STC{=~Ab1mQ_fX+9l&Wu4?7^P^o?v^Qr9((nVMV2)^O^6ziPNJ|>aWOwmaa}M^t zx92|QC_k;0oz%_d2SLK@1mJd1)IvWvtpFXbK$C*4ho}f>nP8=(G`N=i!C^)#Jw|Pl z?^@B<%F5@!ZT>yDWi#+^*p;sR(WI)fs3{87Ev)V8e#E%&9Z|3)`gL>9|y&4%@SI&9<(#~vz&X5c!SKJ z3Wxx1%H+R`;h#Qnt{u%$cb_{&k5pgddZ@~it7y=R!ETYjCCA-Zk zS6_rb^{h9Yj*IDQ+y4{phrLL~B^8SvVqr9nG zK7tv#`a?f;Am%-A@ZgK}^|Uo^SG;Ik*|=&ll^Bh1hmx{gpcfTNUuo8+ak-k}omkn} zta08&Y9^PWm!(N&dz0fEg7gQodt_$Sy;d-nZ&BaKRyMw_FKz-`fz>uA->{{$;n9+gh`F`x^PLvu7o*FGnND1P^0Oo@+NdC z#JG=fRZ_E=!-DdP2~jE5C8aqHjn^MJZw&E^+f51n!oZVlJuKRq1ZlCk;E)iVp`Rhx zf6%P&KYDbd*&I9955R9wl3EH^UF10hTqepLUL{xdAcEsoc`LEh-c~R@KR>X`Lrzee zjZ{Kge;L*p#)Ivj^r)|!-Z1K~<8?oH`nK==y}F9K zA$~Y-Y>8RAy=U2--W#r|>?-Wm+V~<8Szoe|@ zy0_;P??*Q8H+k09)=_bBTDRTs!0Ks9|K4YoVopo4>ya59W@AU@LN8;MKgEE*^rOpi|P{<1A1;+#A?O|S$t zki>tT`AhvW9^148vgI->vQv#pV-*7lbh-lh)`OI!!;ZP!eaHs2K9dg*56f$4WU5c0 z53!1@{+S&MCF{y>_t1h|=M%9bYhSn;-43_fpqW+ow^FG*;<(b*e)4umha4PMAyTHTU4TKV(m zo9pE`zlT_KenU1Mb8+D_qhO3tMfAhWuOv~qek#w!y)5KT`4{y+T(LUnb_$2pQX=R! 
z8@HI)A-2~eNzTncIU7>0i+=j_NwaMmv1qoUTxSrYAly-#+yF4A#i@!MY-oCAojci< z)K^P);SS-a+=u-reG+MQkNwR!4up!Hy+|cd<-29~geNG9Z+~^AO+}$#85sOebuZRB zdpH@M=6~DT`p_Ionm~+sFniJIctp&4I|O5vKsrGdr~QiNrG!q;%*^bJslkgI6^tR( z7p#@MJJ>EOs(Lj32tE|UbSv_}9@kBBKY_^?{;n{t5I|p{Zx$~@Z`$~9PeZ;$$dCS` z#uW=&!Gd?c9^vHVeB&ta4QD8}XJseg0j+Zn=Su{9r`HqA)ce3Ym5_B1 z4zBkMYJC~jdG}};n<-82yswW>8E*96NyA*z?r$k==bwSci@{yKWja;RACuT9y%}!U zNTqm*5O^iZ`Yk7az80YK;YP?GY9GkA3VBt zKf5ePCH$z+`G(%@&@_P2j6&GnWA)fC2eI8CmE4~x|upOo8o(rNMjN3{@b zq7TpqH1~ZVo1~Xw-`}U)bMH&ZHqsg35Yc9u^IV(E_s1V{4T*}C6VubLO?8vCLpU?3 zckd?4@0@Uol#`Ql7z(|kn{9CvOs(X~Kp=5-G#)rp1c@F4KQ6u8vn~5X&7)t9%E~ZR zXYQU0hR75eu7lKSs_y0wQ<*CVt$+SBI6O1mALsz zxNEqVF*6!r3@|#f>Z4?zI>wLf?+^o^@Wbp(BZS;l!4qC5B3|9iYBTaNq}esYVI}PB zcFaSr^cAZ-yT&KXR)g1Fwfrx}2=NP_&+{2oyXw}KxIfi*tZ6G`IF*s1Uu4@bVfVtz zixs`4=)r|BytT+#0-rVzs3zSTtYb(*yqND+lUO6Bmp>|WAWvpxWfeMhZ4ZaQaO2Cs zzUmNzFeP3Ak>{2}zvR+>&-|)!tuI`XSsWg;S*|=Ky)j&@c)#^EMYq6r<4@l$vT2VT zDNl`qvCA_;mgVL$n`+j*{5)c4@2Z|3x3kN&+) zcYOQ;gHas$(?$eOp}Sg>r+I&aGPzDPo$9e3$EcUksfOqo>&vE$e@F_@2bXy$Mcd{F zYg}ioH%rTOScs9`=&3<#F{QWwZT{RR>oc`?VsZBr(57 z=RsQEl_?rI<*qz*H+uJW8S%4=PIe>pvg#BzS>q_l2N-PLJ?RFh+ENM{zn8l;bg% z)&F}Y(8EINYPPS_O3R(o++wc2uC?zJBfF&P?`GlQ5??l#e?JptWaRnXF!NDAe=*Bx zYwY+^+HGGV=d*0Q^Ey0TH^z-ZB_9i+1;@^1NHwHPuUhBo)z{T*^^`6V*rrOH-)8G( zFzN;aB`$h*(d1C}KH_>o!9*UgC!rnUgkO>8>X2n_CLOK@X6p6gMHDVv+ljVdaJcPx z1>hn5zVDt1_|!g>B9Svy#GD#QHoEY4;t{D!ptoxEacQWz3Rf@&TfMB)uO#&mr?og$ zOq@J*>VSt_gYKRa7PLlp2>=WiDt8Mr3CpuESJr#(?3xX)y?vZJV|VdmyYKwn3x&le zI0L!K6e<@22;(Q>t3N2k<@G0JTj_t@#Vo-}6=-i5=`JqTsc6BYJzKUW zV-(Og2G*Y4sb+D8=~|vGw`JkNo7nMstT`ZoC(x9NljmPZqnu#Rv+g$rs0J) z?_SxATkQg99-W-+|1Q^6}++OZcfSEA}aXNfPkhs0A9lCPreU8V^HpU8_ z!n8XB)?(q}U;6vvc;EA#wfk+am-5{ASNP46D8j!J@yqI5boP8j1d==sK@Bn1gNjiI zG;x~XIvqXz?reERlHu^?=2CI@Vm=U`-x;O~Q2TPNII2MW^zrwoA*~b->ADe`p{Ju` zzs8tp)V3sy!#X@ZJ#7W)eKm&J)q-I3^6@dP3g94Fjh(UVdY|3x66Q+Wf@#wL&Z7E7xOn=FYGguE8&wSe!$Z08}T#=~(&Zgjr+k z?&juZo>L$5D9k6Q!fJWE$c2?&4CivwKHiF{vskgHsr>M{{?)79a>Qo-PY`pax)s);jj$IGs80d=onF~rz!Y^-=3aP&8ds&JfuUymk9@sVLVz@Rzb>@ZNYMtA= zt{u5@T{O$g%XGTT$2RD!W84z_mw5yQhXH9xPM2&mOo6jCu$SlcS4?Uig)BqLdIR_- zab++>`l>(NfC65Do5{Vu_+N!33O{@S*8NH5$f%~pO4|GCcN|NM^13)pwGLHgQvX_KtD_ywRd)%eTd zNYUoJ2dR#+vdZrGLND*VRvr>h^Qe3GOPapXrVmHkzfR?i&b{5S_heY@)wBBhinoz- z$R=q#Xq{LT_emQYyQiJ3OG*bi@}?J*D{l&snjWe@_?(3>?d9R_yrUu*Fo_oENSYjt zk1DUx3slVGsDW3RMDCMsmCWe`U#4QlMuLl{v2^3X@4sW0j7?~lJK_P9HhhiJ+5QAI_6rU}(wq@AIES#?vrS=zMu-a>v!@wT4Fhizgm z6$MHfZ5^V0X?BJwE&oUtR}miGoFoBR>a_3Xj~pHts0UjWnOF_pJ6)I0fGlg|n)`mE zC##Lr3Kk5Co^&na3+KowP~a?axQd1 zur1qiN?X}6ibxrqLye6MnA)0>(}=&SLzkL!Z|P3L96 zqZd{8wRJe2mFg*)dJ)pAH+W95ZvR84zdsn>M@4n~^yxqlV-c>Kk4rKH(Lo-GbDUxN zgBl|8Z{94MZkT%TkBpK!TC4i@&90VUblEn3wPR|3^sg^%ig?~)lG8t{^KXV)y2KQ=SlHAo+fM&wAfa~nbU3PM&rB^JTK!8;#_T-WIdU|9^*c+lKDQEw&1 zW{$SGs|lKhDd$rLl@l$Ni@u|-!N#gpjWj593`S{#35>lq3#7@3p!T<}8yZUM%%?^b z?WzvhZ-hjlYDfEW!gAj0!v@sXv+fPf1S-Lbr-@ku9 zKq*d~ayiDMiMwcwX&wqHN}r0ptPaC-aRmkXx<&e2w_P1wlkpZ&7u6*>t<qD;<75 ziWKM1Fvu1jcq7PK1Yeq8QgTIXMZOo0`!EU?Nw4g_0UkVn+gx3p2AzX&-Dp~AMR#9o zwj~SFpj%LC@WHib-?{buNXT#nh={zwU3Thb<*P84VM;Vz*EW=S7T!38?;96eemXZ4 z)&vEsC)HF`$}w5p$E3o!PZ>G;YK<68UtAdDh~8c9L3%B|>tFwU-0(y<)d@jCe^?7V zgK&{(1ha@^#=+;j(_o*_H0rtIDH-M!TU&o-FPFP@ivt4NzF#(ESq;>d#Kq<4Y7>5- zpioj(XFq5cIr#R3qwbZoHMujgCDB8xW9DFB#KV8Zm$!fb<`%U7mzl1YvWG=lzj4qX z9n(mI3YPnP_zd!bz%{8G8vcU*X1Skb>k=}cR+y4ORbY$}$pTXL$X|L*O!T?Lhi4oS zxx>MTxa~J00kXy?&C2Whq-SnrwsJ|@={0(UuO@BXLT&*srmk57TiY|F$cc|iL@mVP zerMw_zjgR7tzt2icZy&Db3;Lb`&UaegXlY8SovRGAlovIch_8MtAp zHuuUmoZBEYk&1VBk66FO-ler$<`xh4LnsXa!1YeBX!!}!E~MNYLSzj)DNKYyB@KU| 
z%-GrU#+ZAuq*hkaxVl$z0x3BY(QBNnbv}nJcufmyP=GnwP?FtU^Me?8AY0o%!@QAj&2HFGvA*2@!)b-hRoGkiPPsw z?`0`P8()m^@?<&uBep7VJZtHT>(=J|*3XazR%*tnKc>61Si1Qzkv~h%;Lc2z>y``F zuD~L9Q!SFIv(NolIU^kdLqJf_z`j1J@XMkq^Xa7h8(#4uL6iCaX#ql%;%0YOlcqQE zA(`R?MPik6rlFn|GtXA`TN^9 zxnXuhL@$?S(gNbP&mH{hSjZe!`@Hr3qC+8K5~4b6v$*kNwYRC{dcF!%_UKU0+%j>V zWN@kcHRHf){bz9YTXk9{afLF(Ka$ep38_pcWo9D_tKDS&D~)rwX}W^+S1-SMH>kY5 z-3sLaAKyTP)8x$IkYAq6j=N$S{!;E=IZ9f;$@>wzs9U6r9!f}}89|kH0!vbLKhM4@ z`E&G>o*v`uqecmujwyx6ep`!Oi`!4As#{5X(dFxt{b%^5XHSud@%ryN4TA^sf2%Te z(oAcX6ZrJLyai5~J0U4pyeR3NpA3yj8o4{A-i9e0n1Kc;sk-W=kO6pIlf%3zL8ETI%J${_4 zTprSV@4|;G9pVz+ULOV$?b|Tzq;!3{Y@dacrzW2fXP_eMbH?nsBUdP{T)SFjetL7@ zbzthUiLlz`P5SCL&T4>LI$esuh3oM#RO=bh7%e<(4Gd`i&|o$nN}*`f^Zj9Nbo4j= z24Z0qIxcRBhg*1$+$Du{2?nFvnXM<*+w0zoJ*~QFT?AncjxMR4AFv0m`#lsC6a_Qi z55qL)350m7^P^={f(0d@{4rc+?oxSSKD|-uE=!nLsc338@kfgKnpb8e=@hDqhF>18 zsU528Q%-9+ZnNC9t3_UP-|mtrvOUes#M##S)GT5{e`p0ZHg|GLoem$Y)*%5B**f*N zGn)m@K3H_$%;6P%#GQL|wed-;;FW9FM)frT|IbUzEMQq;u5dB$m*Ph?KkZ{ZPP1w1?sQGa&vM#`K7z*UKGk>cGh z(k3caOyD^cKk^m%Rop1G$nK^)SbbPFvO_UUBt=pDN-pMz*ydQ^&QJnvLn7}861 zEHJ173mQ6z=JPnlOz@wBFdZcAP9x_9dfe3LObHUR8LBUKbxqaJ{gyvfq>L^VFQi4d z9aX|aUb=41MUo75pO*H_tm6KrJ#gS*%a4S_#Jq`tQr4n^%wcnEyZhZmohh=SvFXE9 z?=5t^ZoGXcq1_{xMLX0iHqOl@IWZeZTv%ZHShy^K^8q`SydON-CS^O4wa5L77F81J zBAD^k+U|>F$F1)=6Qsn!^peDd6QHK81n;w+8Ue28P4(dHiSzzc&z5P{Mc*gQ7JbP; zb2gn%g`AcaJ#6;2A=Y<1Y?99+eSkQmi2Z<_Bp}-8gY&}?Y2sTQBj;e`LY`>R#oTi_ z5KU%qhwztLJZM?8!rAS-bQG<^76w_o+}yb-`B6zP%nvFkX4rRNR3JaN2U1X-IC4Al z8co8^)OTIvz5>4*lwoo(=#+IJ$^ib{Z1Lfqsfj7O5S&*yaBKL{d|iG$;cKV zt9-5G?n0y48N18re8Mxgvh+Nbty$}&*Z8Oy_y{tbfejGCKukSkPyXBN3@3q2_I;1v z6>rX6m!4R>-@itKuGgB#bKNUZk!S7(v_F%BDiZaIFO%@kmB)bXfkNRz&#>&4-rSf% z?HH-1sY!=hzHrnCZ+^U~2r^Wgq{E-o6Uy(M!~^j$P$|W2?GtOEeZl#{{tF5PMXTtBvXT-TH+Pj_=TQxHbuKP0t0GIZ zqvL4Z#JOqzkg?To`~5C=KUulM%$ zqFzAP+vqxJI0HI`;<>J;80Xy-3%bX^Ix=s0-KD?e> zu$vUy3YKSuMbXUli_DKm(sKm}n4>`g8u{#hiV~o>angs>Tvr2s$5ok~xslnr-ZtBe zOs`mfW}W+g2`Jb0h@lHTtWmEU%=!}IBkxoMq`Ax}9JbCrIhY$_783Cb0M9r*PiHWC zX99tTh;?V*Yxag#0(g-`E*gXQrax*{=)hMeydtF2;>S%P#qWc^?u$5G*!6SJnn~0` z1<@7DJ^X;;UP?vsVmHFgdpXtk-<^NLPXR^525?%q){O^6Ceu%6L{j13&RlsvTrRU~^Q!u`x_--gU*QWB2%kfEbr!OdhC+MqYj5+$fqT&GjAP`k8S5g6}Lx3)pauaDCQi0zq z(uk7acyd$jqobyGp~dJ9Y9SaI!qZg~?8U}qq7-T5%3|xCeQ17t@tRasmz*naK>`?l z3x#iDok&sbZIVunL;)>OR0S7-tgo~IQVmgln&y|v@-tjyA*FHlyKw$CJ}dtxq*cni|xF) zrNqhC;@FE)PaI6kTnP13gxuRQm~W}|C#A1p zn!S$(Rq(@_5g!xvvjkY-!&L~cay4@LhLt|?8F zJr86|xxLa>eI?YrZFgO|w-p*c{j7Z$nWPF}GGI=yYI_MtXe&&&YH)tMMU7hWIjO$w zIkEhH&>T9PU!|&X>d5(tUuPf)CC!lhQJ18hl%dOKnBs&S7h7O%Jd9Ke?@-?@-=oCi zrsg)~rr$%bqE`5{qFkchBPz2`hCi_C2-*FcLTB>!qtAsTSw^N(JixQqs4Un3E=j^~ zySVuuJ`Cv2(n}FhBX0@SAU2=S^=j&%-e{QRptvb?$X{}H7s=ZLhAD{rE4QQ+UNS*Q zxO}sr|H;;KD+#*3>JyauJPAnhAuE(dC&1V9cw@)8r&7!^mZY18M4zDtXdkWb1u^uYYpg#RUZX zP@8wWCQ)~A7r7x4X34wDx4GF~^LzQgM{J$4LAxm!Fr#^EsOm(d?Mg-*d*yL9Hl&$K zLRKn!LuZ8AJgMetu!bJDsLe)zaI8iTFJnn-%hjYK~Z zMEmjMPb_`=PF-$C18l=6hDb``bBLw!>2Hyfs8OW>Mg+g20;+-})`5DHGzbJYh8RUi ze!wlr+0>n-WETV*@(jeOZoBr*3PC(4wiLH=8do2LseFoigaz(sO;90fKYlE>Pip>^ znl?azS!i7v(T?f3lNoYMQ{);zi?ewJ1Pt$Hx3kR8#~H54rD(4ew0|fy-j%aq*cC0( z_o3KJ&<)KZRS;Od1HwVNoR<`Hjzvu$st#$rKZKb2NI%(H3~a#C=yT-Z5|FlF<8(QW z^8<1a!wiRC&P$yN$WS{u3rb9c&xB6aP_doi3xbo^!}zAa@xc1;(H#1}WP7+|&~*Sy z;0!hGZx-37(XPhB%NvR|P=ljL>>^+U15CG(Wx1nF>#kMIa-#W@TM1r*34=%G?yL)s z>^Lhoj^b$Tf*`3lT8>>Nd1dD09JFFTaamDu2Hy=Szx}vzHf|7pj_fj6x4Jo;YRd5_ zZeFy<3x(^%bS|Yzq;xn|=rdX;o5u3%lL zk35dIoXznzbumzyJk7EDt$IpfSec4B6F>#r>t=JAAFK^ww7h`a;k$UsR+8?DpZnj?goCT zbms_$U*k~D{xx+EBaa1_ft(`pi*^&qu*g>2fREh%%C-VBzls=y>kRW2fk>`pl>Vr* zs38_xP`b3POd4Xh)aA4f2(wsyHv~~PGL}6s64@w%m7l-%&`dqcA!*`Sh^9=>tC8M8 
z$JxgZ3qB>hzR!xZeT2>vKWkYiVx#PZ(DtEa9GKxl3Zc+{eudP{O#BQ!t^9rZDcYv? znAL(?A|@v0Di_`BT^$vC#x@hr9>e83AiJ*^zhiyE($y4<_5x*iWGJH%@hSry z-4}ARDef7h(&NHk6}hWErxPFhA+Fsimv@at{r6VoqeH5~Hu_!oW^XNt#!A6$9foMa zl$@(ME!{nWi<5)9%86_mdo0I#rSa%S3Wh~q^Lc}Id({s91v`YB-%|31kR{PKMlkR zGLu8$mw!OsAjPEZ1oJsGfoUZD9-X!79?RDt!cbwgpyz;vXGBHhQwsu`n%<#_CEA&NXYFly&Onl{K(WIyi>`uDw4&y1NFp86FL0Q7Q+L$D(|&UDL`ZdKpZ^vT zdhu-uIW#yPf0|&@mlKxlMw?~YN~&&3Q+5 zP*9K<>NVoikf5L|`uhJ4REIPnncs`ZJvT0nHm0i@8pjz;liN>`Ym^=cx&t!wS4P=& zC`=3;dMRAy!n$E_T?^lYLT!fs1$Kn5mqAK{)S~+SmuIxP3zOBUgav~?Z}(oFy;F^w z>5TI6iQr25VM3%?NxI+A~an<mjw!+jD_sNoDKayok7f=Qde;qQF7@vJCkNeMO z7Cli=vhe&iJ1B;lNz8iS#dY$NiXJE;s!)8o@H<{7rIvSBgu-eAc=acKW?_~#Zao)P zU27W~MOD>>1R+m_i((hj1+VC2E7{Z1(2yUqc7FVvF?Z5M*TA6G)L&|-!JO0*1&aYU zQi3&>l=d01Bl}rEnjpFtAoNE=%PW@>Mghq$ze9xB*uD){gy4L=!TCZnz`95uYe~9E z8DW(m)X)zQ_t`Re?46a*t@nuw%{p#nV}q(8M|D1<#)rfEh{mm=;w^c=#FhMPRY#qw zfZxY{d+F)nLB{Ug`~A2knJHOP#SIva_#^n_ZNvglRC#~)HCjMVS@p<}+PzL(4BBGgCDmY^xh(N)8G%S1kHfY1BUx# z>bJYc|J|{k`$%pJtCKe;!aqiQ4*0>4LXf1kHRsHD8lZoZOeC)RW?OC@}a<$&JY z+#L2HE8NTFqZO6y?6PjQ$3VvT-**xlisdXy-FfTjr#MpWvpk8@+vU3D4537PiCg~Jpx@_XHF?G#oCd=$UpvZlHzO!d^;3u~KAkIrR{R#CEOC%WB>d>6~ zD*GpC@^Ekz!7hTVZ@+`%{lipmABY^j^Xq-@Z?$uUN!wtG`kKU1qMkTuTtc9uq<~Mt zz+BzA6VhFxSqngu97f|pZpzBaR=ZK#^A_PB=snnIY=b_HwP?L_wi)uNc|RXXRT9FR zR&pW0!V~h0IIv|_x(ZMX4luqf#14bM*ignC8on@z!mfIl1X*fn-^kJ{?Be7J{yDf- z6w4-He)n%WE5f<<9#Q7u{Vg#hH8N?>tAl$PR$K-|S|Ptr>O9~#z%4!D_FcUg?Xo`O z?;4BdSLyCca1s*#zm*4nQ=QZz=Zu}`8^PX32XnT|mCL&!KK|6=X+9nO1M7%L<@?CN z)B^MXU-8}hY8*P76=4PfCnVXP%zkc^pgnzW@1y)$egR*kHy=T94fZK3wb-DvMTu~6 zkEmhq-l!u-Qkx+?{NuV|3i==RQAWfW3KC+AqgfL!5rRN)Ra3JZ5mlDtvKm<4u2u6+ z*e62D>7K24Cs|Z#FXfaTvGu-4&25_pn3DV=ZB=i2`@-@4Y|Ms?Mkt@I{`XCH&kVl# zWV6r<`w@~Af|YK6EYqpvR5azIV_0tZSK}_**k-eYQ=P5S=ScAKkITvWxs8K^Ik=Ns zv&=(`eA3|Qpvj{Zd(fgl?$mUA+wKKV78eJN8&kk>GLTV`LN3Y07D!HKo?SosMC~sw z6Nkg?QKNq`%Kw1m>J8sDz@GO$U5pXi)pUVE?C?d-z~-dz*rB02LZQ!=XUW~ur;?B! 
z6MA3l%K0zS1b8pQRESJFi5{pKI_E*VE%-d|f_zQwgpQw=`N(YHwH_>GqO)$-Hpj3aoM%c&&hXhZr#O z5csvbm(6c~y2UL5lB%6_H#M2@(z4?%Nfx-CL+SBtuqC32+=5ig zWVc`Vu03V4-7r_6=pVozzH695_8nd4Es04<5^q96%rW?x?f*3O-tk!X{rk94RwXOS zrbs192_+<>NTRaJC?nZqXH_IbJ4GmxBqOqCAxR~~sbpto@2v0f>b^d|$M3rTxvwjo z&i8md$MHOl<2kl*%&}wF7)wU0L+(iR{C=0;422$}?j`n*(?CY-~UsvkH$ zxA&$87}(EliOd|1C;eJ4F>FeoN=4xneu zF6a;vLp8q0mMzaE$1II2xK2opizFTircz7L$>Z}r>N8*1B826O#h~^%2Q;q{|A&r_ z>hf40es*FPHdP@<`0veZ)PQk`_NZyt&4{kSMz%OsQ{7A-q=$3+q8~66h42}yzu(j; zlUPPuR>HHo8KziEC2wHibQGH`yo>#K$9iKzB@0_6elf3}CYGeYAs2ir4uj)JyIx+f z>JIZ^ayg!*v7w{nk^)Ld`0(2cABAZtqr?s2i{KY>UO!aPhB`lUciJG*^LeYPL8x zjvKGdRV;z$pK-e)HXY*i6Gns0^zHC?PtWVm4}*b(u94ApIT?-ovuId+)0(W096N4> zGE-uwI!Zt}I3tUQq-DK7S~ub5TWV^)`}dr@s%*SyEUGdnJtDxY$e;R)?Fqg!o1i58 z-tStlq*O7y4Bf*s-Q+=$lIQ110Y_8^Qg2jG-jJM~KrMY<>TwipU3_2A9NQ}9<28}T zL??~X(v2d^Aq0u_>fKao(*f~rJ*)xK4}K&n0I3}(CiS*K2UeBXb5e|haFDK#Z6*NBj;PyX%X*Ge=SVh+DUkoIuj^KGy!zE@#7{53|HJC#K*vUDsg_5r{6zd#az1ni$tjw=Yd%APY1C+@s(*K;M5d z4aDuQM1R-w75#!Z6k^8HCIX*qP~yqG0fyL4k%9Rg3RSJ0(5S0Um4G%t#ctY`=|e<6 z&|*_zYstX4)1MK%($W@|UR$H=P_R~2zGX4ZO#=JsqebRMfiqvUiX?LAXngN$Sl&J> zq5kXy3;~V-P#|I^MD~4A-NqpYuDvS*T7Hs=iOJ6&^b-A=nwnvo?7vE25C)gAbwpMC z?ECFTozrF^B`ps^G|b8De!dMs8Y(R={Vz*k^fip=g@n>F!4tQ6`i;+~wRBkFBC@$8 zu&1W*yXYC_RpX#SN+QL}e&hVeUMz{WW93b!)22IKnm&!aGfa$(<55SPqcgLO zq}0%0f`JcJC^+4?pr50N5#Oo$Wm8;SR4uE&t1ZW$E>q+%%JO6yG31+>eRTLW{Y7 zQ(E8JBA%Ma`!hZ$jz>W``GlKC^P&WVbPGU|v`%>*XDT<4rf}s=W2k_3`b?uyF*G7b zc(a9#9}qF_-rw@2ziS=0%1ING-rO$D&^qP~OhMoG(OOcK(x^e@m#F;gDc(Q~V(Vd) zepK}`2G$S}M-byb2qHjQ8_5Q!!jI8lCh!euT!FdSh7;Y1!u+fL(%6QpnOneN9*Z9~ zZ{1pzo+^f$auU__S5H;Hd2P zB94bM(q*z{TQg3l>fHha{t5gEta_}(J^&0=A3ra*8=Ceeyb74PYyb>;a(V>&;hUSA zMGz5{QyL{`uTu+|i2d~Ay(6HBN!X7<56Sfkb5Ad6r<8(ftwg+sOwwn=tbvOsPX?_QqHdXNDI)I^l9^r+^ZffBZ z4ikUz=v+V!I-@`taD8Mm*z5kNEqPB^q&RC=N-TlcJR2gnZS5eDIGu#>0!xP~ zP@z9!bmS}5taaoF1JXQvHZ|ulYfZj9Jtz$juG2P1*%{Nwo*IixQ(CQq2G0-5&U}6V z9w_-P_^Tow2MeM?*tgz|j;?{vpX%Gg?~G4E*@^m;`eYvfbx`kg=O6QW)2yiE@zII? zni z4x(UE1J2WfiPYsbhum5geI+wNaa7=6dB_dTfZtcy9gErIq}YkV>(lNombo{i^Bvc+ z>#og1yLbq8v_+Ac}K zQVBNuebJNI6ncY1HYkzmtki>iwRP>uAf&%v8Zhc7TZjgsE2@ zOPBPdD2b<79D=XLv01aOr^bLh?5xBs8eJz9oegsfKB{{UnT1JmZ8=(O?y&Z4r(=Gx z4IJdwfm+0-|0wcyS4sL%h=+NrPZYhAO50`RTie6R%EVj_Ho1)0<#Isz@ZkZh|HB9& zvK075C<2ePZA;E@+*>325~EZJJu!|^YGdc{2QGYLyUgH0NmjnVxsCpmAT_v{?}txt ze=p#pV*?iwXj;u%qf{~})9k2{ZDl?8{gA<1rI<2%2+7YIj;vZa;%o|Dg6U3@$Tp!u*Is>_B$F-Bw*XEEhcbUy$-me zb;1jD*_gv$7T9dEja-CEu=2p}&-KF>s22k2xWnm+O=^M7z)L#Ul-|=xI(sB!z5Eyz zxE1<1rKc&VVp%~lYTw2Y$`r1aMZHmTX(nVTVDnfA2oIqc-AOrjsnqpT?f?}6ycY*} z55)n^_O5fO{dV=wLzr07_5d1MH1-*A-GA$e6L>EHn^oSH77%Zn(^Ayy0G`A1XK`no zI*?bx&eAT;qy{Fy0vSzsq~Kcs z+-mtTxX)P_YFLJ=UE26aPkr~&!Cg!l;I2M*d-WEEgHjs? 
zAOpF(y}|9!h{pIdeTTi1L0UsY!=_zk0nIHvbO>BHsMjvVHE+HpU%y>idfGVK$QO0< zNZGl@0`}>Qjh?(2K#-&y+vK4Qnh$U|vrWW+t|}IYkQm?i2J3|qD&m=2ZDF&%;zkOA zjRn5y9Nbn?@^^5D-98@FAef$N4qh43cz*lr{PCAqT&Iz|1ppDE&XE!a?_v{|hv^j+ zDz5uuQf7RL*A*?F>=YUMd{qr~5@Z>-mqu5X)8ROOXm+2?ck2?3#OJ-n>cL=KisyhMgPqulR3f*Al$Q zQ8wg~UzBc?>TWoLu)xo>ca{2<9Mn#^dR@T5P`3(3)}Uyc)g%lz5TTv|6}u}|i_(xf zcuiyY#&9)sgDThZ3i6X47#CjGOGs`l{hc`^tf0`A=%5Zn5N^(Dx;vV|`~}mw+AgBVaJ)$Xe1n2Q)4Ohc=&=Jn^kU0 zciEeEaro2s_xA_b%4Gfc4BhS#4UM}hUZG)OcWQbh__|k3m;Y}pPS%k)x#43tI5^C5 zTWV?C+`BQJHfTK(zkNHUg`vz$1mn2KOZ8Hp zv{T!+VhdRSz*Ax|Sht<#5Y@!{HIiePM^iC}=_CIU?XmSn#*@azTBw|e+Bx~uL}B|< z1c@N5qAU!*KEX`{qxVP_Gn~P}x~(fG#5lC!^N|IZ00&Pcg<%!-N7LmAF8CmJa+qME zz{M#kknr;54Llkkpug7uAf(a0;yy%|S zW+@2<5N+AYeB26kQJ;-9xb54vb)rA`x&oXAYj4V|vd(22QL8(|q9j4?!y2h=xF2F1 zOm!hLPW|76Am#4|8_j>d#PO zF@#5n47^_X?F}WCfW?LO6&Q5Hf+yys;zo~TXvx7v&!c6(I97{)i; zSd<)##4X0v9)>(wH4N{&8hj8y4#2co<|w#^1|Xm$#-tlF5EmVLYmW zEDY4W7e1=8dp_y22_g!-pfNTEn%e|mm>yJx=`Un0xUyKkzZXlnOsK#cQp6QPsIuQ| zVFfp(2tGcDl_BUMCcdB4*1?;HI-T%80lrI;AICs{gG?B%Y#UAhd8;yE2v6y8nIBaR z^W=Y%Maeowtx!KSok;}#5De4_(;N!G$1y_+RE7o{5>XPMaL?J`$;vVf6bM{zvIIHc zSn|2O+;EH029Ak43K8B>96=1KEkO66B4``x1jL*}Xa@88VlP*OZ^q+OgWwR%xI0(` z`!urJA~Q^oH&)f4M80ZO2Ya!JTlp~KgAK_vpdV8s{m z6+$xtkmN>h(psTh`GM`aixQ!al?3N3OXj-P#Cmoy-c(E2bAJ&C;fd`Js41KfnxIU2 z<2KY3@M;#(|06c(0h7mc@7w%5sw2dquit4vBcdaL0z|S~_>E*Wx-&a$zC4&RDC9lu zk-SN_oY*k%yWMAU9a1>QzxPSNcZ#+4;4H6uAs^(kK+Xn$x=I;#e2}+k$=Qkc56TO@ ziZ4cH5W^?~KR~fw#_DQbInhHngyKedn830ijkz?$=7E++x>sNr2+FK75NkQ*6LZoS zadC0G>m;Dh)83-K*|6;&N7X8 z%ns;>32_s>8Wp*xt=1L`i}UwQ3qSmI8zP4ng$gDkEB==~`1RdlMx7Vk^kjp3GDe?@ zUlqJj_~Ksv?Wmo45f5duUSIs4({jo@tM=DphcCyz>^!3uU4LwiDxX%vS?Bg|_bXPP zIHaVbbX%*l%+POk{q(U_BX(VV^PW>TvX;J%&K_(R2+nP9pPh6o%0VdzchfNx1g&*J z8i6c36OeB0+KLQm9tEbiv~4biSQSM zM3*2gbKS5;v;+@n?(bhVKE8DU@nZMkU6FN5L1WaR^~jlwfScFoF?D@cSs4VQHM)eW zL`tT^L2%XqS!D)BGq#e7Tk;IzQXAJs!2#|G0)S>1gL5$Ay;=S9qXwyZr?zi^RuZw_ zl)IO{zLN%605vRa$hWW3cF|CQonQ zKu$7X)Ym4(M*DBEDB~(6ovVGzN>Q}Un6CWgB2=!oH&{HL;$J8zWvIB+i$#w;a#H7{ zvyzdK z(KUMjzgQ9YPE)d21E(-kYYT$GS{)|Hu#P~p?(d_#gL7A<=ffB~tzh*mb3t`GzYKM~t0 zT){U8kRn1y5+@g?C2ehO^}9UG=6+_Jqs2`5jUUAlqUH}Z7Vp1QCf3>V^D8{LS|pDP zL*~N7{%zeQc-RfJS^@63M~8ooUJN|fTI$-OH8Pvj|O2oqr; z85v4loSztq6@+^8HwMqy8HJ4Y1{fRBA0wy6{h2^U6WJn;v^+T(w{jLK@W3wwM_$Dp zp~@;MfqOqmzt>minSYg*W(7AF><5wYs=in^&`|XE`$?_qlBSZ91yvP*u8%?qP6eDY zJQlNA_~;=nud5HiA>EFpkgR2}?OC;p(bWEhOh%j8t~}x|(-*{L^6!{>!2DKyz!mos z)LNaC9?G9aY}(M(*QfmM|2q7#q=eWD#s5Q^lwvNpC!%qrmF&uF)PSoL8P5#?@tXuEs?7VX8)_QE{gnkW) zuw`l^cBC1Cmx(n;pr37u6zz2-o1#w!1$}??MgP~={&&}$y=RRv#)i)Z*teP0eU+7! zyFvZ6v9;a3ajgptlaP@}z&gJDQ<6Xln(cB{9_ZZ}Yvg7NIF(S>KOSbfcooM8u|)_U zfYL1E(A*$sU+xq2N&r@KE+(wX%-BesH`P zI-w32*;iYARw3nxzzAoT~TPhkd9gf60F>J@-!STVVxQ~WbjxmUZt!#^ro9vr1j0gIsyTT+V zTG~IoOkwGYPcQlV>5a()lk^K1(`%WyG#Q~0RE3@kFp;DF@F^XgLm+5CtcIY{{pSzS zE~0yZQFq7iux~?{4s5h_Pn=LfZ3Ui*#r#AM`qI~WJK+9^_}?`&6joSx0EP+xBk-A| z<=F2Qin-Qjt37Zrn2=oc#C48imgXe8J`R z^(n5%5g`(JIjNfM_AP^9BI_{ZTCi`yyxAc)3lK5hG%tAc@oiSLPtD1BhyxA&(qU9z zU~-CLM~jEYtkvc9_o`Ktm9Hk82m&7$T@-n4qoe+ZCy4?KB30*s-kudoV`Jx^KQJ>~ z1MfxL0TT}%Si*Gl^nN%<(V?E?x+gkb8??sR%|ks=7u@4t7IQ8h9x$WOKCUdJ61^$| zl}?eNv2iNi*4)I?TYRLGUTVIpal~q@qUndLe1t0e#SJq9k;gi^UQ(bJ`1$pLP7{I6E+rEuXztf0ym=@=Hgbm?({p8RzVlMh;Z@a@pAO#ii6) zyPoC$!GUk1KE|bdBZt8ZYE)gIzDwz93G>-+KW0~dq%;OmpSh`KJ_fSg2NVdBJB0(S zVWp69p?2Kim(~Aq0r;!zuf6;B>OminwQ>+SuN_Y^$1T<}iE3)Xn}{F;@B!n3i;y{L zVY-RBrlPeq1~3=g8II*P8wkh8qpTae-MH{~df?hm>n*u?8x`+`DEcq%p43m<@4 zaR)AlP0lU6F}Eu%a)?aYs;w~1L3iy6hThA?mTNAAhVG(J?qzyQJC;nbX*Q*ztv256 zV=Q4L9Ef=0u%7$kRngTZOeUAP!tq1Yx3-A9Su9^au!;S! 
zg_)q6w6ssQ=Wcaoffq$Zyg^|bmiNCOwJ5&F4pBd{o01Y$-R|8Wc93;9de2{Ub$7li z;5&duLM-$>hA=$#=4b@~bnV6-5V=i3EcTzOs)FhHJ(aS}-d#P%8J2il=l*d0_VAFvp8GI?t3w2ud2?G+RqkCX~yCvxe@iBH0WA= zWiVCaOH+g>!gW6%F}^Xk+1?p*;L_mKMdYT?{aP9i07j7`Q>9p@2F=ou8rjL%Q5E!aj`@ z%4D6oKxcP%IRcAgO}iFKARho5B!Li=0Bsr-ia%JnS)xCNrE~z&3L-IQM{BL(g{JpG zi+(@%jtIy<8q-YQwA)E=Gaa)<>(?0hUH06#=?Km4FZ~h$m!Lm%a^9H^gK5mXdA|5! zDh$I8C}$j7lq3TC^Qi4`uN{q}Xhk=pTIL>Gg*n;vK+LFNMd)zWI!eNQ#)$cX7ww_x%H5uwgXqo|~oF+gP)eP<3U1hG@Jf{f@2@|`+NexF+7=+Sl zuquZ8bryP@-y0h*GUFX4rwE8q_vmQ*NQ7xp!yD|00>L>sJv}KS<2LHw6FNFqOwy~` z+v8yTvd6Xm1U8MObMrUaNMn5_DEimn68qL@Cx-3`FLF2`SkFC^n>k%L`ZCwRU~D+| zbxU!y^R0tTos#Rw`no5+DoagX=J6d9Wgclh_0l!=D(p1zPK)@yP== za~fbJuY{`X79Z+7>Y=rChua}(heOAN+KTXwKo;@8af`2w3h*&Jrpd zCVVO;bBzuy@HoDWJG%!17;sJwp_)Nq29eTL)N*iGa`22+OSpp|0=On1E=~Z7iVnMr z+vbzBv-~I$CAFX8GhZ=DI*z(;(uhWh!0znqOzawP6ad#$qq--S2Z%}6-YH>f0Plm$ zv)Mv{)}z#)UyFT;IFNsEEL8nX5yH`|I5xEYAm4R`kid^7tzE1mKlci%~;-@~f2UQReG$L&q zl=#TuBC8r3H&7v6VkbB-w+BydE8lQW#yuefjvlOr%TY`4I{cmzQ^oMw)!m(+LqMSR z)#((AI~qbNS%K-u*j6d>`X^2V98c;-x4W+QVYWJ`4M`YbtzN_V)?oS_=s!ez^x>Ak zEfk#I#U+)LQ?}Z;uUGU^dQc&SH!Th}rSbqQc*w5H;9R&WV}Pw>gqN;^XY1Ca%b+kr zzU)@GqHigrval?5aZK}cYA;gPm&{@KAa$JYDFj56J>-u4UGS@mEh{g-2K$H;Cr*45 z9!=0qMsZRO5A2jiZ}lL_aJ6eVKDb%oy`oi>m7svAf&TyGKBBi!z|!>bb!Hw$b1pO` z?k+CeoP2y=AU?yhyR+vIgL4C!i-PLV?QU#LVzB)~`@(_(bVbl5l0i5^%Tr(fXD=Ix zYy&93O3!9V|Kl5YvMnQ zA>IAZdhy1g>CG4i!Rw>A+uKVhFmHpl{J6@s7R<@MH#P0koXcz(F00HgA!)u*#?(>_~enV%aJ&;+Us>cZ`R zcYSq+sm9!(r)$~yViU#vK+8d-KnIuDw?-(c5_MA3(@#%~QhUL-$}KX+{vz0)cISZN zhgq&U>LX;$>`_Us>?m~gNFJY1bT-4N3<9#^zITfaYVq}hgMr!ymC7mQJE1_uXE zdE)KEqd5n|zKH1(4DNuJVQ<*%L)x48mUycog!y5viee`V%zK;|MkXd3%A_6|Aq52m zQubiCAMn(2aYJ{@N?&0QJy1>E=nj+Uflz`m8|Nw|YCdMTDrBuBKM z_1Pxei zRGK!O{KuV`JpdjA&R*AY-nj?o2M+#ZITl8FXs_NH4QtsCA$6laO1XX1=mGbmq=o^}RjXEIz@v_!M!{=oOdja)&4C3& z>2VVKWW@r9;lc_sId&3#AEAiveS8eeTo!)*uNa_$7`+n`x3aRblrumkw=u7PLM{9{3hcub1590*UW6GOR9oZY3q9LIK80ND9Djv2pIw z?~knbbTwC3*Lw^0*+xLa)zT?@0I%=(df7$;;|{!F=w6l!)S4ZFARwQtjJxM2!FTD( zrpvGRh8$B3;xf+1*r|FQekb1ydFln(j9x0%XRH)sHyIfjE*Nnb7&Jd9T|Bj&Y>0tU z2)(<@c_twT3YGh_1iFj}1E<5Q7c~>=h%pPh~5-x+A_tbK6$R00I-uMHc- zTSkBfqdkCu;1eVzq7jFZquuO2+#%#X)%(wN1|u>9z5je>yHg!E9`r`YQt_hoKn#+Y zk&#h%3B^D#h9}tn7*Qvzssj_18tEOHe!#(i`0h30Ny*<9`u=XCrdX)t`-#M!u9%hC zgsm3?+RO|uKI_un;In__WL3{|3cFtV1-^99B_2u`Z{PZyr7MS>hv#xWLsdH@7`A-l z8xM4Sq$o;rf8oHIY;t3roHC{?gfJ{+WF(vXFwl@&-iYnSNkEo?inLD5l)rxuhEgBm zlo_B(q6n2~^St)+rx83Z`xy)))!##xD(AN-i^WtRfYV{qle_Vg%@}bsg$LEWJ-+ zSWCV<&BMEQT^zB?#>*@+Fnz}Mx>2fM9Tr&wfNphukP*mYpROIJ0I6c(JCQ4ZPJoUj z8Rm=*4r)MZjHaY$y6FrW%!y#d#VgxRzBr+;UxkW`Z}O`IJxBDH571*Z!v_=OQS9gJ z93F0!Sfh6~Ht8)H(vSSA#&|k8Elm{M9OM}w?8BaEFv#4#9rkMnvS4(C(iIcWORGMk z9>yZ3Z`&v15JlHSNGS-c=&0(TT78Q1 zq3;t5JO~{VSzeGC`(O6roW$>04oXVeiXTZtc&NV=#FF|IJ2f#h1@n)07Bzc!Pfw%W zC2a&_?6n01cn@{RWR>Fb9rVs7e)E>gb5)9LV1L192j)|MYX#;b)yj#J7XT!pMF(gI zms5Fpc?WCe)0(h^vqByK=mxqX{Jemm5rAJ>?rMK;uNnsAc}|@)Scom{0+OYcpWQ78&M~d!$ zHWHv`%NCX68bTHJd}W*xXFVN#eSL!&&G;<8Qd(d;Bz>>C0+V69v61{c(6mFGua{n8e~umi93J)4vZHAQ1#@=lQg5w6RCQH%1sI_l;OlgN7id%3$D-pfp43? 
zo#J#YWu=c3Sunh<=VALR>B3bWzQHsKVB=_S2T0C$r`Ii?; zS9p-RV!a2qp=^3AcS#gJV!Mz(y2&{Pb#R?p9*63)v@~YC6xc!VNXC($csavh2?c%5 zMINAw*@vFp9x_g-YiPKKCK5gZ=q)awOIasTZnsS}DXb&7a`ge~d(>P{ISqXCHwU;E zaF%z|i7MC`4W!)Z}|laQrr2kV(bQ>p`{(bRxN;K6ReouLifY7}9ssAzU=8#>n z{3}z^v+6n=sN-dQ=63DSdo2h@Z796v3M5i+>o1@|({w@|;1#hctJZhGx zmN_<+M*Bs%MU6z=;$egvAh#G*)b9ym873~BoA$ekp&@gQL+1jQ75m23PFSf&9EnjH z5p=|J%nZ#1S(#0g^jHIkTk^J=GC1vxq6B&9mm9@6b$_rm7>$6#=~Q*{VEUVGTLB;K zdl*!cZB^w%kan}Qw{~P@L|>`ig)ECtX53) zIeJ#M_+-aY0z8b3jmZs0PgT!@tF1>`^cv1y5kHMOetc}q?C0ByJ4;4Rog^*!*RP`J zI>FW>#6!F!>bPD+6+X>};++^kg_nw-+K%2Fcq+MGY)Ic{VE$SJ7knLaS!gvt=IBZ* z94Y91vxLEp1=@gcPT?G*RN`HV+)J!L|ES6$=0 z(O$^Nz+gR*4ZI8c{chkWgFSHrDkpNugWm^Uq--de{CWTYZpX+7fnCXfGX2bi)UI+I zOz=K(G&bxYorB`SkJ=;0kGubB8gK=3XS~_`R`!4&qVT=QNOg>`(9L7b1H`~b5X&KG zCOj4UT{a#+dg?Non+?$8emftT^99;QFd#d+ES~_kC>)%mq@D?Gb-3{TrH;N2JRKN; z(7GZDtIp&Dx40I(Uf1wUNbtLBu?M~PZF-7X%lt^;sH|&03qU4R!0ZA7{*?jln}P5P z_>3Kju5Cf&(PJ=6-xfL%8<_rZ&$_uJz~gwq;!R8a>K6i8hE444cjB26lb&@npV5e< z%SAOr!N^V)*&mz)40qU2)Fa@?p>Mx7e=ah)(FWB!jvm<$YAAVBeEatEsQDwt`UVF- zL5_r*a5hc!$Ir(tXJ^dbt>TnJm2Z~|M1ojoUMo6=~=cZxhxD~3BYTwl4WGtek!;B0A zpw`x6<5XjJLwE`T+a`t*5GU?in89BLwGnUNi%PZGK)eqIRqIW)X)oc6+MxOP1+sg6 zOCU>b47w})V1r;0%6oK$#8K$ybO|Oyq~4X1l0tj}@`L$Qt5_h$(dW;f_w2Yr@8s@Y z(m9))lCoJur1A}?;nAZZ-Mi5V!fNXS32Nfv&(ZXwoIu5e!5Uf+(ak63I`eb;@3 zK9oiI?3b-^KJT#xU~psIm9T{UM6>!VnBOKcL0W6|8z2t%(9Giv!RxgyLOrE1zF~l9 zK5s-sl#C2>a&g_mLJLGx)KypoNQ^iinbRS>lPCt03{NkwbGtU;3LU;X;1h_mAGizH zrlEW@6cb8H+MMH#CvAq6G$Xs{+XXpI#rq)oqZ)!cUztO$dDahZ-ji@C#p_)- z_|pnqKG2stAS5BVgDF+iy*u0J3u%C6XRBIT7GAs`Ahja2;Anx-tQx!VgvQINH6lqN ziaE)zOvHbe*%MDchWm@fMK{zQY>eK);txQeX!-9DYG~u*QbH9myk#b*IeL586TsBI1$DC$G<|SANgLKHZ$jyA|dB)7?Bk70=}W_`nE23IkfoH<%NV z)(A|4+F!2s_>WaE$`Y{s_~H zV9;TeI&x@ObvM;(F$Fn*c!Ypew^6|- z&%AB`MQs!i0xB-a18d>NyldCk!&xm5^x>xoGzGl7v)5|qwf>E5;B%y_r*}PG3}zN( ztR;Azfk1q6+iB#3Jxst34d9Z7(2c}FmUqHV!r9&ZLv8Ko!&|C494G=S?}}*lLydsu zE^nYlD2OFK!8f-J&IQB!0!nAK@^z}smVTf;6+)UyZ$_FJPH^je>xru1N>f`EeI*i&4y{#q&6N{J%0vDS|i3`u~4Q)jo3 zjcy9hJI!t)7(ha z%D-3*_N>Qd#hq6>JDLf z`NdZWUN}`&2(J+>+1c6P$ff_7*@2{n7oU}nuhru6t5e(Y3v@Vw@Lq%43w?SI-ZoST z(%Wz7Vu%RH!4QuKS#fbHNl^r_;HOU-&{usXe}EP40BKOLgE{PZGT{_zW&*Fc4(MV= z;sx$3_BE&7^kfuLVF$SUvxmXp^8AJGmjfE%JUs^Y=-n6Vw+!r7zHvYoq{<&R0M7^bcnr`i6sXS&3bI~unbJ3CGJ!(aqPjXrO{B0w*ERjKo*(LDPg@RZ|;BkL*73(+)6><^pA9*llz+ zXG66(PBjx6FDCG-y3>{_13M)02nqqhfU0*eaiV33@V$e)v7{qCo z?=-7F?|Ukc_WNO_2g{RTn-y#|35wgcUDT0^ilO?b%Ar$n0&xJ;-AZ4;of^MJR^zMf zUT$cwYQR^GvWz1nB&<+!3qNIC(^Ef<(+ZKXGma#^d_9mj>9!OVU`i8oC&Dz^cd()o25PQ997Rv%bv6Br?8J3{XU*>wOZ&NB8lC4{Te(4y61C9g#h7z#Ay4_Wdsm! z#}|x3;ZH^m!!6XZsNl*0j!H&yF$t**t;$%@U9qC2yNBw3G%ufZe zbaA2<$%H-;UV)^exL2DmFZ@D`31e_!xE#W9$SPXVO&Tz_|6;cQIObz7I5n2PIQo@? 
zQqnH?6bKcekYniT=y@5~0Ks5A+Ps1hJ_^1l2B^7@%ixE;^t^abC&!93x-A%%}a&eJD;dC)msR|DG%&I!5T164lF&qJH zo&i1Mj9W#?(##nUCu$-kO9#egfCfQ2-MixBZ2U^X6U1jMpNbnQ5eLwE3pKG6Xxe}P z1^Lt%gti1^dsEq_jy0}sw%W_R6bC%1&2Pbe{w-2(5E3TI$%v`IRS1BdH~32wlE>po zAbo%Zj@!N&qI+=X$zJA(iGnmaEyaiMc!KK%YRh*G4U;c!5tRrEpiCTUH6b<$iRS}D zLz-BHhf;PO(@FdasLY9pH<2dcHY2reYm~rU(V{W14NSkG`EEx)%Aj8C(R!e5$RK2- z1CS87kzI4f75r?~Uqk&RmlMfg$N#T@1mFq|05)r2#qIm{_La~pn0}E04Dd5vhCNX| z6D7dfh#L-`bNbwqngv^_#~|h5uLDkm@g@p7(##Pod7GIKzKISM9JWc0GsVl3&?4vK zR>D!FOepLa3<$-gu_(WNgH{=h6p+#mw5#Y8;8a~bpV69&3?0m?YlA5MVxe{!gs)FGm`QN0<9(n zNu~;~R?o=|7~H-pD*DA*ArYL3!4<6RrOP*Vyo~qsqVBp7>vbIjoQ@X~`bGRe?sygb zp5QyeWb+H6G0*E~Jt%rn&l9>3HnLru$+tsev!A%!yTFDg$C}DEd2E9ZXh(`z=s;H0 z?wnaIj+W{aJze=tzjP)?vLA3}}XSgPG!wz1{;GqBgSo~(Z0cJ?l zxx8f#B0l*vLoqQi^q4*pBZ{MjrvLx1B=K+FUx;ZiDXhMJ{W=ly?>CV%DQ!L{MU{f; z`TB>dDl)%9tny*|?{W`p3n+dn_UZG(wWeRoWDpgP5;lNoDL)q%*ZivgE{m-!YPXLZ zwdoKHrcg;>l-SkVE4owie}8ZQ1r>awc+E-gD2IE8@?+^0^)cUY`-=eV^>VCUuW{O%J;u^nS zO4~b;NT&cn3_D^B;zSrYO#l1XysH8?1C~yv>S-6ZjV013G}>fq|6N!hV^|+w-$d#E z`!r6PM+G4r(kC%h>jrNKEenF?ZN2|{d(NyiQ;eOQz609BJ2btR)?gO=bMQcl*ryGS zrUQM^zOO}c>h47JxVL=x@^&ev{zd8=-;0iN>wn9WhrYZSs3Bz90B~Ly$I{M}k4^KB zF$G{@Z>nBi+|+b-vyJ diff --git a/Documentation/RCU/Design/Requirements/RCUApplicability.svg b/Documentation/RCU/Design/Requirements/RCUApplicability.svg deleted file mode 100644 index ebcbeee391ed7..0000000000000 --- a/Documentation/RCU/Design/Requirements/RCUApplicability.svg +++ /dev/null @@ -1,237 +0,0 @@ - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - Read-Mostly, Stale & - - Inconsistent Data OK - - (RCU Works Great!!!) - - (RCU Works Well) - - Read-Mostly, Need Consistent Data - - Read-Write, Need Consistent Data - - Update-Mostly, Need Consistent Data - - (RCU Might Be OK...) - - (1) Provide Existence Guarantees For Update-Friendly Mechanisms - - (2) Provide Wait-Free Read-Side Primitives for Real-Time Use) - - (RCU is Very Unlikely to be the Right Tool For The Job, But it Can: - - diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 01e12b86e81fd..c67a96a2a3894 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1120,12 +1120,27 @@ These classes is covered in the following sections.

    Specialization

    -RCU is and always has been intended primarily for read-mostly situations, as -illustrated by the following figure. -This means that RCU's read-side primitives are optimized, often at the +RCU is and always has been intended primarily for read-mostly situations, +which means that RCU's read-side primitives are optimized, often at the expense of its update-side primitives. +Experience thus far is captured by the following list of situations: -

    RCUApplicability.svg

    +
      +
    1. Read-mostly data, where stale and inconsistent data is not + a problem: RCU works great! +
    2. Read-mostly data, where data must be consistent: + RCU works well. +
    3. Read-write data, where data must be consistent: + RCU might work OK. + Or not. +
    4. Write-mostly data, where data must be consistent: + RCU is very unlikely to be the right tool for the job, + with the following exceptions, where RCU can provide: +
        +
      1. Existence guarantees for update-friendly mechanisms. +
      2. Wait-free read-side primitives for real-time use. +
      +

This focus on read-mostly situations means that RCU must interoperate
@@ -1171,10 +1186,7 @@ some period of time, so the exact wait period is a judgment call.
One of our pair of veterinarians might wait 30 seconds before pronouncing
the cat dead, while the other might insist on waiting a full minute.
The two veterinarians would then disagree on the state of the cat during
-the final 30 seconds of the minute following the last heartbeat, as
-fancifully illustrated below:
-
-

    2013-08-is-it-dead.png

    +the final 30 seconds of the minute following the last heartbeat.

    Interestingly enough, this same situation applies to hardware. diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx index 3355f1f9384c9..d6a84f3e0451a 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -1257,12 +1257,27 @@ These classes is covered in the following sections.

    Specialization

    -RCU is and always has been intended primarily for read-mostly situations, as -illustrated by the following figure. -This means that RCU's read-side primitives are optimized, often at the +RCU is and always has been intended primarily for read-mostly situations, +which means that RCU's read-side primitives are optimized, often at the expense of its update-side primitives. +Experience thus far is captured by the following list of situations: -

    RCUApplicability.svg

    +
      +
    1. Read-mostly data, where stale and inconsistent data is not + a problem: RCU works great! +
    2. Read-mostly data, where data must be consistent: + RCU works well. +
    3. Read-write data, where data must be consistent: + RCU might work OK. + Or not. +
    4. Write-mostly data, where data must be consistent: + RCU is very unlikely to be the right tool for the job, + with the following exceptions, where RCU can provide: +
        +
      1. Existence guarantees for update-friendly mechanisms. +
      2. Wait-free read-side primitives for real-time use. +
      +

This focus on read-mostly situations means that RCU must interoperate
@@ -1330,10 +1345,7 @@ some period of time, so the exact wait period is a judgment call.
One of our pair of veterinarians might wait 30 seconds before pronouncing
the cat dead, while the other might insist on waiting a full minute.
The two veterinarians would then disagree on the state of the cat during
-the final 30 seconds of the minute following the last heartbeat, as
-fancifully illustrated below:
-
-

    2013-08-is-it-dead.png

    +the final 30 seconds of the minute following the last heartbeat.

Interestingly enough, this same situation applies to hardware.

--
GitLab

From 6146f8df48cb52c46c256424bd03b567b889b7bb Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Tue, 15 Mar 2016 13:25:20 -0700
Subject: [PATCH 072/705] documentation: Get rid of duplicate .htmlx file

This commit uses colors to obscure the quick-quiz answers, thus getting
rid of the .htmlx file.  Use your mouse to select the answer in order
to see the text.  Alternatively, use your favorite scripting language
to remove all occurrences of "" from the file.

Reported-by: Linus Torvalds
Signed-off-by: Paul E. McKenney
---
 .../RCU/Design/Requirements/Requirements.html |  889 +++--
 .../Design/Requirements/Requirements.htmlx    | 2872 -----------------
 Documentation/RCU/Design/htmlqqz.sh           |  108 -
 3 files changed, 428 insertions(+), 3441 deletions(-)
 delete mode 100644 Documentation/RCU/Design/Requirements/Requirements.htmlx
 delete mode 100755 Documentation/RCU/Design/htmlqqz.sh

diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html
index c67a96a2a3894..acdad96f78e9c 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -1,5 +1,3 @@
-
-
@@ -65,8 +63,8 @@ All that aside, here are the categories of currently known RCU requirements:

This is followed by a summary,
-which is in turn followed by the inevitable
-answers to the quick quizzes.
+however, the answers to each quick quiz immediately follow the quiz.
+Select the big white space with your mouse to see the answer.

    Fundamental Requirements

    @@ -153,13 +151,27 @@ Therefore, the outcome: cannot happen. -

    Quick Quiz 1: -Wait a minute! -You said that updaters can make useful forward progress concurrently -with readers, but pre-existing readers will block -synchronize_rcu()!!! -Just who are you trying to fool??? -
    Answer + + + + + + + +
     
    Quick Quiz:
    + Wait a minute! + You said that updaters can make useful forward progress concurrently + with readers, but pre-existing readers will block + synchronize_rcu()!!! + Just who are you trying to fool??? +
    Answer:
    + First, if updaters do not wish to be blocked by readers, they can use + call_rcu() or kfree_rcu(), which will + be discussed later. + Second, even when using synchronize_rcu(), the other + update-side code does run concurrently with readers, whether + pre-existing or not. +
     

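A minimal sketch of the non-blocking update pattern named in the answer,
reusing the document's gp and gp_lock example; foo_reclaim() and
remove_gp_nonblocking() are illustrative names, not from the patch:

	struct foo {
		int a;
		int b;
		struct rcu_head rh;
	};

	static void foo_reclaim(struct rcu_head *rhp)
	{
		kfree(container_of(rhp, struct foo, rh));
	}

	bool remove_gp_nonblocking(void)
	{
		struct foo *p;

		spin_lock(&gp_lock);
		p = rcu_access_pointer(gp);	/* Pointer test only. */
		if (!p) {
			spin_unlock(&gp_lock);
			return false;
		}
		rcu_assign_pointer(gp, NULL);
		spin_unlock(&gp_lock);
		call_rcu(&p->rh, foo_reclaim);	/* Or: kfree_rcu(p, rh); */
		return true;	/* No waiting for a grace period. */
	}

The grace period still elapses before foo_reclaim() is invoked; the
updater simply does not wait for it.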
    This scenario resembles one of the first uses of RCU in @@ -210,9 +222,20 @@ to guarantee that do_something() never runs concurrently with recovery(), but with little or no synchronization overhead in do_something_dlm(). -

    Quick Quiz 2: -Why is the synchronize_rcu() on line 28 needed? -
    Answer + + + + + + + +
     
    Quick Quiz:
    + Why is the synchronize_rcu() on line 28 needed? +
    Answer:
    + Without that extra grace period, memory reordering could result in + do_something_dlm() executing do_something() + concurrently with the last bits of recovery(). +
     

    In order to avoid fatal problems such as deadlocks, @@ -332,12 +355,27 @@ It also prevents any number of “interesting” compiler optimizations, for example, the use of gp as a scratch location immediately preceding the assignment. -

    Quick Quiz 3: -But rcu_assign_pointer() does nothing to prevent the -two assignments to p->a and p->b -from being reordered. -Can't that also cause problems? -
    Answer + + + + + + + +
     
    Quick Quiz:
    + But rcu_assign_pointer() does nothing to prevent the + two assignments to p->a and p->b + from being reordered. + Can't that also cause problems? +
    Answer:
    + No, it cannot. + The readers cannot see either of these two fields until + the assignment to gp, by which time both fields are + fully initialized. + So reordering the assignments + to p->a and p->b cannot possibly + cause any problems. +
     

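Restating the answer in code: a compact variant of the document's
add_gp(), with the update-side locking omitted and a single updater
assumed, so that only the ordering is on display:

	bool add_gp_sketch(int a, int b)
	{
		struct foo *p = kmalloc(sizeof(*p), GFP_KERNEL);

		if (!p)
			return false;
		p->a = a;	/* These two stores may be reordered */
		p->b = b;	/* with each other...                */
		rcu_assign_pointer(gp, p);	/* ...but not past here. */
		return true;
	}

A reader that sees the new value of gp is therefore guaranteed to see
fully initialized ->a and ->b fields.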
    It is tempting to assume that the reader need not do anything special @@ -494,11 +532,42 @@ The rcu_access_pointer() on line 6 is similar to code protected by the corresponding update-side lock. -

    Quick Quiz 4: -Without the rcu_dereference() or the -rcu_access_pointer(), what destructive optimizations -might the compiler make use of? -
    Answer + + + + + + + +
     
    Quick Quiz:
    + Without the rcu_dereference() or the + rcu_access_pointer(), what destructive optimizations + might the compiler make use of? +
    Answer:
+ Let's start with what happens to do_something_gp()
+ if it fails to use rcu_dereference().
+ It could reuse a value formerly fetched from this same pointer.
+ It could also fetch the pointer from gp in a byte-at-a-time
+ manner, resulting in load tearing, in turn resulting in a bytewise
+ mash-up of two distinct pointer values.
+ It might even use value-speculation optimizations, where it makes
+ a wrong guess, but by the time it gets around to checking the
+ value, an update has changed the pointer to match the wrong guess.
+ Too bad about any dereferences that returned pre-initialization garbage
+ in the meantime!
+
+

    + For remove_gp_synchronous(), as long as all modifications + to gp are carried out while holding gp_lock, + the above optimizations are harmless. + However, + with CONFIG_SPARSE_RCU_POINTER=y, + sparse will complain if you + define gp with __rcu and then + access it without using + either rcu_access_pointer() or rcu_dereference(). +

     

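The sparse checking mentioned in the answer hinges on how the pointer
is declared. A minimal sketch (check_gp() is an illustrative name):

	struct foo __rcu *gp;	/* __rcu: sparse checks all accesses. */

	void check_gp(void)
	{
		spin_lock(&gp_lock);
		/* OK: the pointer is only tested, never dereferenced. */
		if (rcu_access_pointer(gp))
			pr_info("gp is non-NULL\n");
		/* A plain "if (gp)" here would draw a sparse warning. */
		spin_unlock(&gp_lock);
	}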
    In short, RCU's publish-subscribe guarantee is provided by the combination @@ -571,28 +640,156 @@ systems with more than one CPU: synchronize_rcu() migrates in the meantime. -

    Quick Quiz 5: -Given that multiple CPUs can start RCU read-side critical sections -at any time without any ordering whatsoever, how can RCU possibly tell whether -or not a given RCU read-side critical section starts before a -given instance of synchronize_rcu()? -
    Answer - -

    Quick Quiz 6: -The first and second guarantees require unbelievably strict ordering! -Are all these memory barriers really required? -
    Answer - -

    Quick Quiz 7: -You claim that rcu_read_lock() and rcu_read_unlock() -generate absolutely no code in some kernel builds. -This means that the compiler might arbitrarily rearrange consecutive -RCU read-side critical sections. -Given such rearrangement, if a given RCU read-side critical section -is done, how can you be sure that all prior RCU read-side critical -sections are done? -Won't the compiler rearrangements make that impossible to determine? -
    Answer + + + + + + + +
     
    Quick Quiz:
    + Given that multiple CPUs can start RCU read-side critical sections + at any time without any ordering whatsoever, how can RCU possibly + tell whether or not a given RCU read-side critical section starts + before a given instance of synchronize_rcu()? +
    Answer:
    + If RCU cannot tell whether or not a given + RCU read-side critical section starts before a + given instance of synchronize_rcu(), + then it must assume that the RCU read-side critical section + started first. + In other words, a given instance of synchronize_rcu() + can avoid waiting on a given RCU read-side critical section only + if it can prove that synchronize_rcu() started first. +
     
    + + + + + + + + +
     
    Quick Quiz:
    + The first and second guarantees require unbelievably strict ordering! + Are all these memory barriers really required? +
    Answer:
    + Yes, they really are required. + To see why the first guarantee is required, consider the following + sequence of events: + + +
      +
    1. + CPU 1: rcu_read_lock() + +
    2. + CPU 1: q = rcu_dereference(gp); + /* Very likely to return p. */ + +
    3. + CPU 0: list_del_rcu(p); + +
    4. + CPU 0: synchronize_rcu() starts. + +
    5. + CPU 1: do_something_with(q->a); + /* No smp_mb(), so might happen after kfree(). */ + +
    6. + CPU 1: rcu_read_unlock() + +
    7. + CPU 0: synchronize_rcu() returns. + +
    8. + CPU 0: kfree(p); + +
    + +

    + Therefore, there absolutely must be a full memory barrier between the + end of the RCU read-side critical section and the end of the + grace period. + + +

    + The sequence of events demonstrating the necessity of the second rule + is roughly similar: + + +

      +
    1. CPU 0: list_del_rcu(p); + +
    2. CPU 0: synchronize_rcu() starts. + +
    3. CPU 1: rcu_read_lock() + +
    4. CPU 1: q = rcu_dereference(gp); + /* Might return p if no memory barrier. */ + +
    5. CPU 0: synchronize_rcu() returns. + +
    6. CPU 0: kfree(p); + +
    7. + CPU 1: do_something_with(q->a); /* Boom!!! */ + +
    8. CPU 1: rcu_read_unlock() + +
    + +

    + And similarly, without a memory barrier between the beginning of the + grace period and the beginning of the RCU read-side critical section, + CPU 1 might end up accessing the freelist. + + +

    + The “as if” rule of course applies, so that any + implementation that acts as if the appropriate memory barriers + were in place is a correct implementation. + That said, it is much easier to fool yourself into believing + that you have adhered to the as-if rule than it is to actually + adhere to it! +

     
    + + + + + + + + +
     
    Quick Quiz:
    + You claim that rcu_read_lock() and rcu_read_unlock() + generate absolutely no code in some kernel builds. + This means that the compiler might arbitrarily rearrange consecutive + RCU read-side critical sections. + Given such rearrangement, if a given RCU read-side critical section + is done, how can you be sure that all prior RCU read-side critical + sections are done? + Won't the compiler rearrangements make that impossible to determine? +
    Answer:
    + In cases where rcu_read_lock() and rcu_read_unlock() + generate absolutely no code, RCU infers quiescent states only at + special locations, for example, within the scheduler. + Because calls to schedule() had better prevent calling-code + accesses to shared variables from being rearranged across the call to + schedule(), if RCU detects the end of a given RCU read-side + critical section, it will necessarily detect the end of all prior + RCU read-side critical sections, no matter how aggressively the + compiler scrambles the code. + + +

    + Again, this all assumes that the compiler cannot scramble code across + calls to the scheduler, out of interrupt handlers, into the idle loop, + into user-mode code, and so on. + But if your kernel build allows that sort of scrambling, you have broken + far more than just RCU! +

     

    Note that these memory-barrier requirements do not replace the fundamental @@ -637,9 +834,19 @@ inconvenience can be avoided through use of the call_rcu() and kfree_rcu() API members described later in this document. -

    Quick Quiz 8: -But how does the upgrade-to-write operation exclude other readers? -
    Answer + + + + + + + +
     
    Quick Quiz:
    + But how does the upgrade-to-write operation exclude other readers? +
    Answer:
    + It doesn't, just like normal RCU updates, which also do not exclude + RCU readers. +
     

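One common shape of the read-to-write upgrade, sketched under the
assumption that each element carries its own spinlock and a ->deleted
flag (both hypothetical, as is upgrade_and_update()):

	bool upgrade_and_update(void)
	{
		struct foo *p;
		bool ret = false;

		rcu_read_lock();
		p = rcu_dereference(gp);
		if (p) {
			spin_lock(&p->lock);	/* Upgrade to write... */
			if (!p->deleted) {	/* ...and recheck state. */
				p->a++;
				ret = true;
			}
			spin_unlock(&p->lock);
		}
		rcu_read_unlock();
		return ret;
	}

The per-element lock excludes other updaters, but, as the answer notes,
concurrent RCU readers run throughout.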
    This guarantee allows lookup code to be shared between read-side @@ -725,9 +932,20 @@ to do significant reordering. This is by design: Any significant ordering constraints would slow down these fast-path APIs. -

    Quick Quiz 9: -Can't the compiler also reorder this code? -
    Answer + + + + + + + +
     
    Quick Quiz:
    + Can't the compiler also reorder this code? +
    Answer:
    + No, the volatile casts in READ_ONCE() and + WRITE_ONCE() prevent the compiler from reordering in + this particular case. +
     

    Readers Do Not Exclude Updaters

    @@ -780,10 +998,25 @@ new readers can start immediately after synchronize_rcu() starts, and synchronize_rcu() is under no obligation to wait for these new readers. -

    Quick Quiz 10: -Suppose that synchronize_rcu() did wait until all readers had completed. -Would the updater be able to rely on this? -
    Answer + + + + + + + +
     
    Quick Quiz:
    + Suppose that synchronize_rcu() did wait until all readers had completed. + Would the updater be able to rely on this? +
    Answer:
    + No. + Even if synchronize_rcu() were to wait until + all readers had completed, a new reader might start immediately after + synchronize_rcu() completed. + Therefore, the code following + synchronize_rcu() cannot rely on there being no readers + in any case. +
     

    Grace Periods Don't Partition Read-Side Critical Sections

    @@ -980,11 +1213,24 @@ grace period. As a result, an RCU read-side critical section cannot partition a pair of RCU grace periods. -

    Quick Quiz 11: -How long a sequence of grace periods, each separated by an RCU read-side -critical section, would be required to partition the RCU read-side -critical sections at the beginning and end of the chain? -
    Answer + + + + + + + +
     
    Quick Quiz:
    + How long a sequence of grace periods, each separated by an RCU + read-side critical section, would be required to partition the RCU + read-side critical sections at the beginning and end of the chain? +
    Answer:
    + In theory, an infinite number. + In practice, an unknown number that is sensitive to both implementation + details and timing considerations. + Therefore, even in practice, RCU users must abide by the + theoretical rather than the practical answer. +
     

    Disabling Preemption Does Not Block Grace Periods

    @@ -1153,9 +1399,43 @@ synchronization primitives be legal within RCU read-side critical sections, including spinlocks, sequence locks, atomic operations, reference counters, and memory barriers. -

    Quick Quiz 12: -What about sleeping locks? -
    Answer + + + + + + + +
     
    Quick Quiz:
    + What about sleeping locks? +
    Answer:
+ These are forbidden within Linux-kernel RCU read-side critical
+ sections because it is not legal to place a quiescent state
+ (in this case, voluntary context switch) within an RCU read-side
+ critical section.
+ However, sleeping locks may be used within userspace RCU read-side
+ critical sections, and also within Linux-kernel sleepable RCU
+ (SRCU)
+ read-side critical sections.
+ In addition, the -rt patchset turns spinlocks into
+ sleeping locks so that the corresponding critical sections
+ can be preempted, which also means that these sleeplockified
+ spinlocks (but not other sleeping locks!) may be acquired within
+ -rt-Linux-kernel RCU read-side critical sections.
+
+

+ Note that it is legal for a normal RCU read-side
+ critical section to conditionally acquire a sleeping lock
+ (as in mutex_trylock()), but only as long as it does
+ not loop indefinitely attempting to conditionally acquire that
+ sleeping lock.
+ The key point is that things like mutex_trylock()
+ either return with the mutex held, or return an error indication if
+ the mutex was not immediately available.
+ Either way, mutex_trylock() returns immediately without
+ sleeping.

     

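The mutex_trylock() case from the answer, in sketch form (the ->m
mutex in struct foo is hypothetical):

	void reader_with_trylock(void)
	{
		struct foo *p;

		rcu_read_lock();
		p = rcu_dereference(gp);
		/*
		 * mutex_trylock() returns immediately, whether or not
		 * it acquired the mutex, so it cannot sleep here.  An
		 * unconditional mutex_lock() would illegally place a
		 * quiescent state within the read-side critical section.
		 */
		if (p && mutex_trylock(&p->m)) {
			do_something(p->a, p->b);
			mutex_unlock(&p->m);
		}
		rcu_read_unlock();
	}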
    It often comes as a surprise that many algorithms do not require a @@ -1378,12 +1658,27 @@ write an RCU callback function that takes too long. Long-running operations should be relegated to separate threads or (in the Linux kernel) workqueues. -

    Quick Quiz 13: -Why does line 19 use rcu_access_pointer()? -After all, call_rcu() on line 25 stores into the -structure, which would interact badly with concurrent insertions. -Doesn't this mean that rcu_dereference() is required? -
    Answer + + + + + + + +
     
    Quick Quiz:
    + Why does line 19 use rcu_access_pointer()? + After all, call_rcu() on line 25 stores into the + structure, which would interact badly with concurrent insertions. + Doesn't this mean that rcu_dereference() is required? +
    Answer:
    + Presumably the ->gp_lock acquired on line 18 excludes + any changes, including any insertions that rcu_dereference() + would protect against. + Therefore, any insertions will be delayed until after + ->gp_lock + is released on line 25, which in turn means that + rcu_access_pointer() suffices. +
     

    However, all that remove_gp_cb() is doing is @@ -1430,14 +1725,31 @@ This was due to the fact that RCU was not heavily used within DYNIX/ptx, so the very few places that needed something like synchronize_rcu() simply open-coded it. -

    Quick Quiz 14: -Earlier it was claimed that call_rcu() and -kfree_rcu() allowed updaters to avoid being blocked -by readers. -But how can that be correct, given that the invocation of the callback -and the freeing of the memory (respectively) must still wait for -a grace period to elapse? -
    Answer + + + + + + + +
     
    Quick Quiz:
    + Earlier it was claimed that call_rcu() and + kfree_rcu() allowed updaters to avoid being blocked + by readers. + But how can that be correct, given that the invocation of the callback + and the freeing of the memory (respectively) must still wait for + a grace period to elapse? +
    Answer:
    + We could define things this way, but keep in mind that this sort of + definition would say that updates in garbage-collected languages + cannot complete until the next time the garbage collector runs, + which does not seem at all reasonable. + The key point is that in most cases, an updater using either + call_rcu() or kfree_rcu() can proceed to the + next update as soon as it has invoked call_rcu() or + kfree_rcu(), without having to wait for a subsequent + grace period. +
     

    But what if the updater must wait for the completion of code to be @@ -1862,11 +2174,26 @@ kthreads to be spawned. Therefore, invoking synchronize_rcu() during scheduler initialization can result in deadlock. -

    Quick Quiz 15: -So what happens with synchronize_rcu() during -scheduler initialization for CONFIG_PREEMPT=n -kernels? -
    Answer + + + + + + + +
     
    Quick Quiz:
    + So what happens with synchronize_rcu() during + scheduler initialization for CONFIG_PREEMPT=n + kernels? +
    Answer:
+ In CONFIG_PREEMPT=n kernels, synchronize_rcu()
+ maps directly to synchronize_sched().
+ Therefore, synchronize_rcu() works normally throughout
+ boot in CONFIG_PREEMPT=n kernels.
+ However, your code must also work in CONFIG_PREEMPT=y kernels,
+ so it is still necessary to avoid invoking synchronize_rcu()
+ during scheduler initialization.
     

    I learned of these boot-time requirements as a result of a series of @@ -2571,10 +2898,23 @@ If you needed to wait on multiple different flavors of SRCU (but why???), you would need to create a wrapper function resembling call_my_srcu() for each SRCU flavor. -

    Quick Quiz 16: -But what if I need to wait for multiple RCU flavors, but I also need -the grace periods to be expedited? -
    Answer + + + + + + + +
     
    Quick Quiz:
    + But what if I need to wait for multiple RCU flavors, but I also need + the grace periods to be expedited? +
    Answer:
    + If you are using expedited grace periods, there should be less penalty + for waiting on them in succession. + But if that is nevertheless a problem, you can use workqueues + or multiple kthreads to wait on the various expedited grace + periods concurrently. +
     

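A sketch of the workqueue approach suggested in the answer, assuming
that exactly two flavors (RCU and RCU-sched) must be waited for; all
function names here are hypothetical:

	static void wait_rcu_exp(struct work_struct *w)
	{
		synchronize_rcu_expedited();
	}

	static void wait_sched_exp(struct work_struct *w)
	{
		synchronize_sched_expedited();
	}

	void synchronize_two_flavors_expedited(void)
	{
		struct work_struct w1, w2;

		INIT_WORK_ONSTACK(&w1, wait_rcu_exp);
		INIT_WORK_ONSTACK(&w2, wait_sched_exp);
		schedule_work(&w1);
		schedule_work(&w2);	/* Both waits now in flight. */
		flush_work(&w1);
		flush_work(&w2);
		destroy_work_on_stack(&w1);
		destroy_work_on_stack(&w2);
	}

The two expedited grace periods then proceed concurrently rather than
back to back.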
    Again, it is usually better to adjust the RCU read-side critical sections @@ -2678,377 +3018,4 @@ and is provided under the terms of the Creative Commons Attribution-Share Alike 3.0 United States license. -

    -Answers to Quick Quizzes

    - - -

    Quick Quiz 1: -Wait a minute! -You said that updaters can make useful forward progress concurrently -with readers, but pre-existing readers will block -synchronize_rcu()!!! -Just who are you trying to fool??? - - -

    Answer: -First, if updaters do not wish to be blocked by readers, they can use -call_rcu() or kfree_rcu(), which will -be discussed later. -Second, even when using synchronize_rcu(), the other -update-side code does run concurrently with readers, whether pre-existing -or not. - - -

    Back to Quick Quiz 1. - - -

    Quick Quiz 2: -Why is the synchronize_rcu() on line 28 needed? - - -

    Answer: -Without that extra grace period, memory reordering could result in -do_something_dlm() executing do_something() -concurrently with the last bits of recovery(). - - -

    Back to Quick Quiz 2. - - -

    Quick Quiz 3: -But rcu_assign_pointer() does nothing to prevent the -two assignments to p->a and p->b -from being reordered. -Can't that also cause problems? - - -

    Answer: -No, it cannot. -The readers cannot see either of these two fields until -the assignment to gp, by which time both fields are -fully initialized. -So reordering the assignments -to p->a and p->b cannot possibly -cause any problems. - - -

    Back to Quick Quiz 3. - - -

    Quick Quiz 4: -Without the rcu_dereference() or the -rcu_access_pointer(), what destructive optimizations -might the compiler make use of? - - -

    Answer: -Let's start with what happens to do_something_gp() -if it fails to use rcu_dereference(). -It could reuse a value formerly fetched from this same pointer. -It could also fetch the pointer from gp in a byte-at-a-time -manner, resulting in load tearing, in turn resulting a bytewise -mash-up of two distince pointer values. -It might even use value-speculation optimizations, where it makes a wrong -guess, but by the time it gets around to checking the value, an update -has changed the pointer to match the wrong guess. -Too bad about any dereferences that returned pre-initialization garbage -in the meantime! - -

    -For remove_gp_synchronous(), as long as all modifications -to gp are carried out while holding gp_lock, -the above optimizations are harmless. -However, -with CONFIG_SPARSE_RCU_POINTER=y, -sparse will complain if you -define gp with __rcu and then -access it without using -either rcu_access_pointer() or rcu_dereference(). - - -

    Back to Quick Quiz 4. - - -

    Quick Quiz 5: -Given that multiple CPUs can start RCU read-side critical sections -at any time without any ordering whatsoever, how can RCU possibly tell whether -or not a given RCU read-side critical section starts before a -given instance of synchronize_rcu()? - - -

    Answer: -If RCU cannot tell whether or not a given -RCU read-side critical section starts before a -given instance of synchronize_rcu(), -then it must assume that the RCU read-side critical section -started first. -In other words, a given instance of synchronize_rcu() -can avoid waiting on a given RCU read-side critical section only -if it can prove that synchronize_rcu() started first. - - -

    Back to Quick Quiz 5. - - -

    Quick Quiz 6: -The first and second guarantees require unbelievably strict ordering! -Are all these memory barriers really required? - - -

    Answer: -Yes, they really are required. -To see why the first guarantee is required, consider the following -sequence of events: - -

      -
    1. CPU 1: rcu_read_lock() -
    2. CPU 1: q = rcu_dereference(gp); - /* Very likely to return p. */ -
    3. CPU 0: list_del_rcu(p); -
    4. CPU 0: synchronize_rcu() starts. -
    5. CPU 1: do_something_with(q->a); - /* No smp_mb(), so might happen after kfree(). */ -
    6. CPU 1: rcu_read_unlock() -
    7. CPU 0: synchronize_rcu() returns. -
    8. CPU 0: kfree(p); -
    - -

    -Therefore, there absolutely must be a full memory barrier between the -end of the RCU read-side critical section and the end of the -grace period. - -

    -The sequence of events demonstrating the necessity of the second rule -is roughly similar: - -

      -
    1. CPU 0: list_del_rcu(p); -
    2. CPU 0: synchronize_rcu() starts. -
    3. CPU 1: rcu_read_lock() -
    4. CPU 1: q = rcu_dereference(gp); - /* Might return p if no memory barrier. */ -
    5. CPU 0: synchronize_rcu() returns. -
    6. CPU 0: kfree(p); -
    7. CPU 1: do_something_with(q->a); /* Boom!!! */ -
    8. CPU 1: rcu_read_unlock() -
    - -

    -And similarly, without a memory barrier between the beginning of the -grace period and the beginning of the RCU read-side critical section, -CPU 1 might end up accessing the freelist. - -

    -The “as if” rule of course applies, so that any implementation -that acts as if the appropriate memory barriers were in place is a -correct implementation. -That said, it is much easier to fool yourself into believing that you have -adhered to the as-if rule than it is to actually adhere to it! - - -

    Back to Quick Quiz 6. - - -

    Quick Quiz 7: -You claim that rcu_read_lock() and rcu_read_unlock() -generate absolutely no code in some kernel builds. -This means that the compiler might arbitrarily rearrange consecutive -RCU read-side critical sections. -Given such rearrangement, if a given RCU read-side critical section -is done, how can you be sure that all prior RCU read-side critical -sections are done? -Won't the compiler rearrangements make that impossible to determine? - - -

    Answer: -In cases where rcu_read_lock() and rcu_read_unlock() -generate absolutely no code, RCU infers quiescent states only at -special locations, for example, within the scheduler. -Because calls to schedule() had better prevent calling-code -accesses to shared variables from being rearranged across the call to -schedule(), if RCU detects the end of a given RCU read-side -critical section, it will necessarily detect the end of all prior -RCU read-side critical sections, no matter how aggressively the -compiler scrambles the code. - -

    -Again, this all assumes that the compiler cannot scramble code across -calls to the scheduler, out of interrupt handlers, into the idle loop, -into user-mode code, and so on. -But if your kernel build allows that sort of scrambling, you have broken -far more than just RCU! - - -

    Back to Quick Quiz 7. - - -

    Quick Quiz 8: -But how does the upgrade-to-write operation exclude other readers? - - -

    Answer: -It doesn't, just like normal RCU updates, which also do not exclude -RCU readers. - - -

    Back to Quick Quiz 8. - - -

    Quick Quiz 9: -Can't the compiler also reorder this code? - - -

    Answer: -No, the volatile casts in READ_ONCE() and -WRITE_ONCE() prevent the compiler from reordering in -this particular case. - - -

    Back to Quick Quiz 9. - - -

    Quick Quiz 10: -Suppose that synchronize_rcu() did wait until all readers had completed. -Would the updater be able to rely on this? - - -

    Answer: -No. -Even if synchronize_rcu() were to wait until -all readers had completed, a new reader might start immediately after -synchronize_rcu() completed. -Therefore, the code following -synchronize_rcu() cannot rely on there being no readers -in any case. - - -

    Back to Quick Quiz 10. - - -

    Quick Quiz 11: -How long a sequence of grace periods, each separated by an RCU read-side -critical section, would be required to partition the RCU read-side -critical sections at the beginning and end of the chain? - - -

    Answer: -In theory, an infinite number. -In practice, an unknown number that is sensitive to both implementation -details and timing considerations. -Therefore, even in practice, RCU users must abide by the theoretical rather -than the practical answer. - - -

    Back to Quick Quiz 11. - - -

    Quick Quiz 12: -What about sleeping locks? - - -

    Answer: -These are forbidden within Linux-kernel RCU read-side critical sections -because it is not legal to place a quiescent state (in this case, -voluntary context switch) within an RCU read-side critical section. -However, sleeping locks may be used within userspace RCU read-side critical -sections, and also within Linux-kernel sleepable RCU -(SRCU) -read-side critical sections. -In addition, the -rt patchset turns spinlocks into a sleeping locks so -that the corresponding critical sections can be preempted, which -also means that these sleeplockified spinlocks (but not other sleeping locks!) -may be acquire within -rt-Linux-kernel RCU read-side critical sections. - -

    -Note that it is legal for a normal RCU read-side critical section -to conditionally acquire a sleeping locks (as in mutex_trylock()), -but only as long as it does not loop indefinitely attempting to -conditionally acquire that sleeping locks. -The key point is that things like mutex_trylock() -either return with the mutex held, or return an error indication if -the mutex was not immediately available. -Either way, mutex_trylock() returns immediately without sleeping. - - -

    Back to Quick Quiz 12. - - -

    Quick Quiz 13: -Why does line 19 use rcu_access_pointer()? -After all, call_rcu() on line 25 stores into the -structure, which would interact badly with concurrent insertions. -Doesn't this mean that rcu_dereference() is required? - - -

    Answer: -Presumably the ->gp_lock acquired on line 18 excludes -any changes, including any insertions that rcu_dereference() -would protect against. -Therefore, any insertions will be delayed until after ->gp_lock -is released on line 25, which in turn means that -rcu_access_pointer() suffices. - - -

    Back to Quick Quiz 13. - - -

    Quick Quiz 14: -Earlier it was claimed that call_rcu() and -kfree_rcu() allowed updaters to avoid being blocked -by readers. -But how can that be correct, given that the invocation of the callback -and the freeing of the memory (respectively) must still wait for -a grace period to elapse? - - -

    Answer: -We could define things this way, but keep in mind that this sort of -definition would say that updates in garbage-collected languages -cannot complete until the next time the garbage collector runs, -which does not seem at all reasonable. -The key point is that in most cases, an updater using either -call_rcu() or kfree_rcu() can proceed to the -next update as soon as it has invoked call_rcu() or -kfree_rcu(), without having to wait for a subsequent -grace period. - - -

    Back to Quick Quiz 14. - - -

    Quick Quiz 15: -So what happens with synchronize_rcu() during -scheduler initialization for CONFIG_PREEMPT=n -kernels? - - -

    Answer: -In CONFIG_PREEMPT=n kernel, synchronize_rcu() -maps directly to synchronize_sched(). -Therefore, synchronize_rcu() works normally throughout -boot in CONFIG_PREEMPT=n kernels. -However, your code must also work in CONFIG_PREEMPT=y kernels, -so it is still necessary to avoid invoking synchronize_rcu() -during scheduler initialization. - - -

    Back to Quick Quiz 15. - - -

    Quick Quiz 16: -But what if I need to wait for multiple RCU flavors, but I also need -the grace periods to be expedited? - - -

    Answer: -If you are using expedited grace periods, there should be less penalty -for waiting on them in succession. -But if that is nevertheless a problem, you can use workqueues or multiple -kthreads to wait on the various expedited grace periods concurrently. - - -

    Back to Quick Quiz 16. - - diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx deleted file mode 100644 index d6a84f3e0451a..0000000000000 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ /dev/null @@ -1,2872 +0,0 @@ - - - A Tour Through RCU's Requirements [LWN.net] - - -

    A Tour Through RCU's Requirements

    - -

    Copyright IBM Corporation, 2015

    -

    Author: Paul E. McKenney

    -

    The initial version of this document appeared in the -LWN articles -here, -here, and -here.

    - -

    Introduction

    - -

    -Read-copy update (RCU) is a synchronization mechanism that is often -used as a replacement for reader-writer locking. -RCU is unusual in that updaters do not block readers, -which means that RCU's read-side primitives can be exceedingly fast -and scalable. -In addition, updaters can make useful forward progress concurrently -with readers. -However, all this concurrency between RCU readers and updaters does raise -the question of exactly what RCU readers are doing, which in turn -raises the question of exactly what RCU's requirements are. - -

    -This document therefore summarizes RCU's requirements, and can be thought -of as an informal, high-level specification for RCU. -It is important to understand that RCU's specification is primarily -empirical in nature; -in fact, I learned about many of these requirements the hard way. -This situation might cause some consternation, however, not only -has this learning process been a lot of fun, but it has also been -a great privilege to work with so many people willing to apply -technologies in interesting new ways. - -

    -All that aside, here are the categories of currently known RCU requirements: -

    - -
      -
    1. - Fundamental Requirements -
    2. Fundamental Non-Requirements -
    3. - Parallelism Facts of Life -
    4. - Quality-of-Implementation Requirements -
    5. - Linux Kernel Complications -
    6. - Software-Engineering Requirements -
    7. - Other RCU Flavors -
    8. - Possible Future Changes -
    - -

    -This is followed by a summary, -which is in turn followed by the inevitable -answers to the quick quizzes. - -

    Fundamental Requirements

    - -

    -RCU's fundamental requirements are the closest thing RCU has to hard -mathematical requirements. -These are: - -

      -
    1. - Grace-Period Guarantee -
    2. - Publish-Subscribe Guarantee -
    3. - Memory-Barrier Guarantees -
    4. - RCU Primitives Guaranteed to Execute Unconditionally -
    5. - Guaranteed Read-to-Write Upgrade -
    - -

    Grace-Period Guarantee

    - -

    -RCU's grace-period guarantee is unusual in being premeditated: -Jack Slingwine and I had this guarantee firmly in mind when we started -work on RCU (then called “rclock”) in the early 1990s. -That said, the past two decades of experience with RCU have produced -a much more detailed understanding of this guarantee. - -

    -RCU's grace-period guarantee allows updaters to wait for the completion -of all pre-existing RCU read-side critical sections. -An RCU read-side critical section -begins with the marker rcu_read_lock() and ends with -the marker rcu_read_unlock(). -These markers may be nested, and RCU treats a nested set as one -big RCU read-side critical section. -Production-quality implementations of rcu_read_lock() and -rcu_read_unlock() are extremely lightweight, and in -fact have exactly zero overhead in Linux kernels built for production -use with CONFIG_PREEMPT=n. - -

    -This guarantee allows ordering to be enforced with extremely low -overhead to readers, for example: - -

    -
    - 1 int x, y;
    - 2
    - 3 void thread0(void)
    - 4 {
    - 5   rcu_read_lock();
    - 6   r1 = READ_ONCE(x);
    - 7   r2 = READ_ONCE(y);
    - 8   rcu_read_unlock();
    - 9 }
    -10
    -11 void thread1(void)
    -12 {
    -13   WRITE_ONCE(x, 1);
    -14   synchronize_rcu();
    -15   WRITE_ONCE(y, 1);
    -16 }
    -
    -
    - -

    -Because the synchronize_rcu() on line 14 waits for -all pre-existing readers, any instance of thread0() that -loads a value of zero from x must complete before -thread1() stores to y, so that instance must -also load a value of zero from y. -Similarly, any instance of thread0() that loads a value of -one from y must have started after the -synchronize_rcu() started, and must therefore also load -a value of one from x. -Therefore, the outcome: -

    -
    -(r1 == 0 && r2 == 1)
    -
    -
    -cannot happen. - -

    @@QQ@@ -Wait a minute! -You said that updaters can make useful forward progress concurrently -with readers, but pre-existing readers will block -synchronize_rcu()!!! -Just who are you trying to fool??? -

    @@QQA@@ -First, if updaters do not wish to be blocked by readers, they can use -call_rcu() or kfree_rcu(), which will -be discussed later. -Second, even when using synchronize_rcu(), the other -update-side code does run concurrently with readers, whether pre-existing -or not. -

    @@QQE@@ - -

    -This scenario resembles one of the first uses of RCU in -DYNIX/ptx, -which managed a distributed lock manager's transition into -a state suitable for handling recovery from node failure, -more or less as follows: - -

    -
    - 1 #define STATE_NORMAL        0
    - 2 #define STATE_WANT_RECOVERY 1
    - 3 #define STATE_RECOVERING    2
    - 4 #define STATE_WANT_NORMAL   3
    - 5
    - 6 int state = STATE_NORMAL;
    - 7
    - 8 void do_something_dlm(void)
    - 9 {
    -10   int state_snap;
    -11
    -12   rcu_read_lock();
    -13   state_snap = READ_ONCE(state);
    -14   if (state_snap == STATE_NORMAL)
    -15     do_something();
    -16   else
    -17     do_something_carefully();
    -18   rcu_read_unlock();
    -19 }
    -20
    -21 void start_recovery(void)
    -22 {
    -23   WRITE_ONCE(state, STATE_WANT_RECOVERY);
    -24   synchronize_rcu();
    -25   WRITE_ONCE(state, STATE_RECOVERING);
    -26   recovery();
    -27   WRITE_ONCE(state, STATE_WANT_NORMAL);
    -28   synchronize_rcu();
    -29   WRITE_ONCE(state, STATE_NORMAL);
    -30 }
    -
    -
    - -

    -The RCU read-side critical section in do_something_dlm() -works with the synchronize_rcu() in start_recovery() -to guarantee that do_something() never runs concurrently -with recovery(), but with little or no synchronization -overhead in do_something_dlm(). - -

    @@QQ@@ -Why is the synchronize_rcu() on line 28 needed? -

    @@QQA@@ -Without that extra grace period, memory reordering could result in -do_something_dlm() executing do_something() -concurrently with the last bits of recovery(). -

    @@QQE@@ - -

    -In order to avoid fatal problems such as deadlocks, -an RCU read-side critical section must not contain calls to -synchronize_rcu(). -Similarly, an RCU read-side critical section must not -contain anything that waits, directly or indirectly, on completion of -an invocation of synchronize_rcu(). - -

    -Although RCU's grace-period guarantee is useful in and of itself, with -quite a few use cases, -it would be good to be able to use RCU to coordinate read-side -access to linked data structures. -For this, the grace-period guarantee is not sufficient, as can -be seen in function add_gp_buggy() below. -We will look at the reader's code later, but in the meantime, just think of -the reader as locklessly picking up the gp pointer, -and, if the value loaded is non-NULL, locklessly accessing the -->a and ->b fields. - -

    -
    - 1 bool add_gp_buggy(int a, int b)
    - 2 {
    - 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
    - 4   if (!p)
    - 5     return -ENOMEM;
    - 6   spin_lock(&gp_lock);
    - 7   if (rcu_access_pointer(gp)) {
    - 8     spin_unlock(&gp_lock);
    - 9     return false;
    -10   }
    -11   p->a = a;
    -12   p->b = a;
    -13   gp = p; /* ORDERING BUG */
    -14   spin_unlock(&gp_lock);
    -15   return true;
    -16 }
    -
    -
    - -

    -The problem is that both the compiler and weakly ordered CPUs are within -their rights to reorder this code as follows: - -

    -
    - 1 bool add_gp_buggy_optimized(int a, int b)
    - 2 {
    - 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
    - 4   if (!p)
    - 5     return -ENOMEM;
    - 6   spin_lock(&gp_lock);
    - 7   if (rcu_access_pointer(gp)) {
    - 8     spin_unlock(&gp_lock);
    - 9     return false;
    -10   }
    -11   gp = p; /* ORDERING BUG */
    -12   p->a = a;
    -13   p->b = a;
    -14   spin_unlock(&gp_lock);
    -15   return true;
    -16 }
    -
    -
    - -

    -If an RCU reader fetches gp just after -add_gp_buggy_optimized executes line 11, -it will see garbage in the ->a and ->b -fields. -And this is but one of many ways in which compiler and hardware optimizations -could cause trouble. -Therefore, we clearly need some way to prevent the compiler and the CPU from -reordering in this manner, which brings us to the publish-subscribe -guarantee discussed in the next section. - -

    Publish/Subscribe Guarantee

    - -

    -RCU's publish-subscribe guarantee allows data to be inserted -into a linked data structure without disrupting RCU readers. -The updater uses rcu_assign_pointer() to insert the -new data, and readers use rcu_dereference() to -access data, whether new or old. -The following shows an example of insertion: - -

    -
    - 1 bool add_gp(int a, int b)
    - 2 {
    - 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
    - 4   if (!p)
    - 5     return -ENOMEM;
    - 6   spin_lock(&gp_lock);
    - 7   if (rcu_access_pointer(gp)) {
    - 8     spin_unlock(&gp_lock);
    - 9     return false;
    -10   }
    -11   p->a = a;
    -12   p->b = a;
    -13   rcu_assign_pointer(gp, p);
    -14   spin_unlock(&gp_lock);
    -15   return true;
    -16 }
    -
    -
    - -

    -The rcu_assign_pointer() on line 13 is conceptually -equivalent to a simple assignment statement, but also guarantees -that its assignment will -happen after the two assignments in lines 11 and 12, -similar to the C11 memory_order_release store operation. -It also prevents any number of “interesting” compiler -optimizations, for example, the use of gp as a scratch -location immediately preceding the assignment. - -

    @@QQ@@ -But rcu_assign_pointer() does nothing to prevent the -two assignments to p->a and p->b -from being reordered. -Can't that also cause problems? -

    @@QQA@@ -No, it cannot. -The readers cannot see either of these two fields until -the assignment to gp, by which time both fields are -fully initialized. -So reordering the assignments -to p->a and p->b cannot possibly -cause any problems. -

    @@QQE@@ - -

    -It is tempting to assume that the reader need not do anything special -to control its accesses to the RCU-protected data, -as shown in do_something_gp_buggy() below: - -

    -
    - 1 bool do_something_gp_buggy(void)
    - 2 {
    - 3   rcu_read_lock();
    - 4   p = gp;  /* OPTIMIZATIONS GALORE!!! */
    - 5   if (p) {
    - 6     do_something(p->a, p->b);
    - 7     rcu_read_unlock();
    - 8     return true;
    - 9   }
    -10   rcu_read_unlock();
    -11   return false;
    -12 }
    -
    -
    - -

    -However, this temptation must be resisted because there are a -surprisingly large number of ways that the compiler -(to say nothing of -DEC Alpha CPUs) -can trip this code up. -For but one example, if the compiler were short of registers, it -might choose to refetch from gp rather than keeping -a separate copy in p as follows: - -

    -
    - 1 bool do_something_gp_buggy_optimized(void)
    - 2 {
    - 3   rcu_read_lock();
    - 4   if (gp) { /* OPTIMIZATIONS GALORE!!! */
    - 5     do_something(gp->a, gp->b);
    - 6     rcu_read_unlock();
    - 7     return true;
    - 8   }
    - 9   rcu_read_unlock();
    -10   return false;
    -11 }
    -
    -
    - -

    -If this function ran concurrently with a series of updates that -replaced the current structure with a new one, -the fetches of gp->a -and gp->b might well come from two different structures, -which could cause serious confusion. -To prevent this (and much else besides), do_something_gp() uses -rcu_dereference() to fetch from gp: - -

    -
    - 1 bool do_something_gp(void)
    - 2 {
    - 3   rcu_read_lock();
    - 4   p = rcu_dereference(gp);
    - 5   if (p) {
    - 6     do_something(p->a, p->b);
    - 7     rcu_read_unlock();
    - 8     return true;
    - 9   }
    -10   rcu_read_unlock();
    -11   return false;
    -12 }
    -
    -
    - -

    -The rcu_dereference() uses volatile casts and (for DEC Alpha) -memory barriers in the Linux kernel. -Should a -high-quality implementation of C11 memory_order_consume [PDF] -ever appear, then rcu_dereference() could be implemented -as a memory_order_consume load. -Regardless of the exact implementation, a pointer fetched by -rcu_dereference() may not be used outside of the -outermost RCU read-side critical section containing that -rcu_dereference(), unless protection of -the corresponding data element has been passed from RCU to some -other synchronization mechanism, most commonly locking or -reference counting. - -

    -In short, updaters use rcu_assign_pointer() and readers -use rcu_dereference(), and these two RCU API elements -work together to ensure that readers have a consistent view of -newly added data elements. - -

    -Of course, it is also necessary to remove elements from RCU-protected -data structures, for example, using the following process: - -

      -
    1. Remove the data element from the enclosing structure. -
    2. Wait for all pre-existing RCU read-side critical sections - to complete (because only pre-existing readers can possibly have - a reference to the newly removed data element). -
    3. At this point, only the updater has a reference to the - newly removed data element, so it can safely reclaim - the data element, for example, by passing it to kfree(). -
    - -This process is implemented by remove_gp_synchronous(): - -
    -
    - 1 bool remove_gp_synchronous(void)
    - 2 {
    - 3   struct foo *p;
    - 4
    - 5   spin_lock(&gp_lock);
    - 6   p = rcu_access_pointer(gp);
    - 7   if (!p) {
    - 8     spin_unlock(&gp_lock);
    - 9     return false;
    -10   }
    -11   rcu_assign_pointer(gp, NULL);
    -12   spin_unlock(&gp_lock);
    -13   synchronize_rcu();
    -14   kfree(p);
    -15   return true;
    -16 }
    -
    -
    - -

    -This function is straightforward, with line 13 waiting for a grace -period before line 14 frees the old data element. -This waiting ensures that readers will reach line 7 of -do_something_gp() before the data element referenced by -p is freed. -The rcu_access_pointer() on line 6 is similar to -rcu_dereference(), except that: - -

      -
    1. The value returned by rcu_access_pointer() - cannot be dereferenced. - If you want to access the value pointed to as well as - the pointer itself, use rcu_dereference() - instead of rcu_access_pointer(). -
2. The call to rcu_access_pointer() need not be
- protected.
- In contrast, rcu_dereference() must either be
- within an RCU read-side critical section or in a code
- segment where the pointer cannot change, for example, in
- code protected by the corresponding update-side lock.
- (See the example following this list.)
    - -
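-For example, a function that merely reports whether gp is non-NULL
-can use rcu_access_pointer() with no protection whatsoever (a sketch;
-the name gp_exists() is ours):
-
- 1 bool gp_exists(void)
- 2 {
- 3   /* No rcu_read_lock() needed: the pointer is only tested,
- 4    * never dereferenced. */
- 5   return rcu_access_pointer(gp) != NULL;
- 6 }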

    @@QQ@@ -Without the rcu_dereference() or the -rcu_access_pointer(), what destructive optimizations -might the compiler make use of? -

@@QQA@@
-Let's start with what happens to do_something_gp()
-if it fails to use rcu_dereference().
-It could reuse a value formerly fetched from this same pointer.
-It could also fetch the pointer from gp in a byte-at-a-time
-manner, resulting in load tearing, in turn resulting in a bytewise
-mash-up of two distinct pointer values.
-It might even use value-speculation optimizations, where it makes a wrong
-guess, but by the time it gets around to checking the value, an update
-has changed the pointer to match the wrong guess.
-Too bad about any dereferences that returned pre-initialization garbage
-in the meantime!

    -For remove_gp_synchronous(), as long as all modifications -to gp are carried out while holding gp_lock, -the above optimizations are harmless. -However, -with CONFIG_SPARSE_RCU_POINTER=y, -sparse will complain if you -define gp with __rcu and then -access it without using -either rcu_access_pointer() or rcu_dereference(). -

    @@QQE@@ - -

    -In short, RCU's publish-subscribe guarantee is provided by the combination -of rcu_assign_pointer() and rcu_dereference(). -This guarantee allows data elements to be safely added to RCU-protected -linked data structures without disrupting RCU readers. -This guarantee can be used in combination with the grace-period -guarantee to also allow data elements to be removed from RCU-protected -linked data structures, again without disrupting RCU readers. - -

-This guarantee was only partially premeditated.
-DYNIX/ptx used an explicit memory barrier for publication, but had nothing
-resembling rcu_dereference() for subscription, nor did it
-have anything resembling the smp_read_barrier_depends()
-that was later subsumed into rcu_dereference().
-The need for these operations made itself known quite suddenly at a
-late-1990s meeting with the DEC Alpha architects, back in the days when
-DEC was still a free-standing company.
-It took the Alpha architects a good hour to convince me that any sort
-of barrier would ever be needed, and it then took me a good two hours
-to convince them that their documentation did not make this point clear.
-More recent work with the C and C++ standards committees has provided
-much education on tricks and traps from the compiler.
-In short, compilers were much less tricky in the early 1990s, but in
-2015, don't even think about omitting rcu_dereference()!

    Memory-Barrier Guarantees

    - -

    -The previous section's simple linked-data-structure scenario clearly -demonstrates the need for RCU's stringent memory-ordering guarantees on -systems with more than one CPU: - -

      -
    1. Each CPU that has an RCU read-side critical section that - begins before synchronize_rcu() starts is - guaranteed to execute a full memory barrier between the time - that the RCU read-side critical section ends and the time that - synchronize_rcu() returns. - Without this guarantee, a pre-existing RCU read-side critical section - might hold a reference to the newly removed struct foo - after the kfree() on line 14 of - remove_gp_synchronous(). -
    2. Each CPU that has an RCU read-side critical section that ends - after synchronize_rcu() returns is guaranteed - to execute a full memory barrier between the time that - synchronize_rcu() begins and the time that the RCU - read-side critical section begins. - Without this guarantee, a later RCU read-side critical section - running after the kfree() on line 14 of - remove_gp_synchronous() might - later run do_something_gp() and find the - newly deleted struct foo. -
    3. If the task invoking synchronize_rcu() remains - on a given CPU, then that CPU is guaranteed to execute a full - memory barrier sometime during the execution of - synchronize_rcu(). - This guarantee ensures that the kfree() on - line 14 of remove_gp_synchronous() really does - execute after the removal on line 11. -
4. If the task invoking synchronize_rcu() migrates
- among a group of CPUs during that invocation, then each of the
- CPUs in that group is guaranteed to execute a full memory barrier
- sometime during the execution of synchronize_rcu().
- This guarantee ensures that the kfree() on
- line 14 of remove_gp_synchronous() really does
- execute after the removal on line 11, even in the case where the
- thread executing the synchronize_rcu() migrates in the meantime.
    - -

    @@QQ@@ -Given that multiple CPUs can start RCU read-side critical sections -at any time without any ordering whatsoever, how can RCU possibly tell whether -or not a given RCU read-side critical section starts before a -given instance of synchronize_rcu()? -

    @@QQA@@ -If RCU cannot tell whether or not a given -RCU read-side critical section starts before a -given instance of synchronize_rcu(), -then it must assume that the RCU read-side critical section -started first. -In other words, a given instance of synchronize_rcu() -can avoid waiting on a given RCU read-side critical section only -if it can prove that synchronize_rcu() started first. -

    @@QQE@@ - -

    @@QQ@@ -The first and second guarantees require unbelievably strict ordering! -Are all these memory barriers really required? -

    @@QQA@@ -Yes, they really are required. -To see why the first guarantee is required, consider the following -sequence of events: - -

      -
    1. CPU 1: rcu_read_lock() -
    2. CPU 1: q = rcu_dereference(gp); - /* Very likely to return p. */ -
    3. CPU 0: list_del_rcu(p); -
    4. CPU 0: synchronize_rcu() starts. -
    5. CPU 1: do_something_with(q->a); - /* No smp_mb(), so might happen after kfree(). */ -
    6. CPU 1: rcu_read_unlock() -
    7. CPU 0: synchronize_rcu() returns. -
    8. CPU 0: kfree(p); -
    - -

    -Therefore, there absolutely must be a full memory barrier between the -end of the RCU read-side critical section and the end of the -grace period. - -

    -The sequence of events demonstrating the necessity of the second rule -is roughly similar: - -

      -
    1. CPU 0: list_del_rcu(p); -
    2. CPU 0: synchronize_rcu() starts. -
    3. CPU 1: rcu_read_lock() -
    4. CPU 1: q = rcu_dereference(gp); - /* Might return p if no memory barrier. */ -
    5. CPU 0: synchronize_rcu() returns. -
    6. CPU 0: kfree(p); -
    7. CPU 1: do_something_with(q->a); /* Boom!!! */ -
    8. CPU 1: rcu_read_unlock() -
    - -

    -And similarly, without a memory barrier between the beginning of the -grace period and the beginning of the RCU read-side critical section, -CPU 1 might end up accessing the freelist. - -

    -The “as if” rule of course applies, so that any implementation -that acts as if the appropriate memory barriers were in place is a -correct implementation. -That said, it is much easier to fool yourself into believing that you have -adhered to the as-if rule than it is to actually adhere to it! -

    @@QQE@@ - -

    @@QQ@@ -You claim that rcu_read_lock() and rcu_read_unlock() -generate absolutely no code in some kernel builds. -This means that the compiler might arbitrarily rearrange consecutive -RCU read-side critical sections. -Given such rearrangement, if a given RCU read-side critical section -is done, how can you be sure that all prior RCU read-side critical -sections are done? -Won't the compiler rearrangements make that impossible to determine? -

    @@QQA@@ -In cases where rcu_read_lock() and rcu_read_unlock() -generate absolutely no code, RCU infers quiescent states only at -special locations, for example, within the scheduler. -Because calls to schedule() had better prevent calling-code -accesses to shared variables from being rearranged across the call to -schedule(), if RCU detects the end of a given RCU read-side -critical section, it will necessarily detect the end of all prior -RCU read-side critical sections, no matter how aggressively the -compiler scrambles the code. - -

    -Again, this all assumes that the compiler cannot scramble code across -calls to the scheduler, out of interrupt handlers, into the idle loop, -into user-mode code, and so on. -But if your kernel build allows that sort of scrambling, you have broken -far more than just RCU! -

    @@QQE@@ - -

    -Note that these memory-barrier requirements do not replace the fundamental -RCU requirement that a grace period wait for all pre-existing readers. -On the contrary, the memory barriers called out in this section must operate in -such a way as to enforce this fundamental requirement. -Of course, different implementations enforce this requirement in different -ways, but enforce it they must. - -

    RCU Primitives Guaranteed to Execute Unconditionally

    - -

    -The common-case RCU primitives are unconditional. -They are invoked, they do their job, and they return, with no possibility -of error, and no need to retry. -This is a key RCU design philosophy. - -

-However, this philosophy is pragmatic rather than pigheaded.
-If someone comes up with a good justification for a particular conditional
-RCU primitive, it might well be implemented and added.
-After all, this guarantee was reverse-engineered, not premeditated.
-The unconditional nature of the RCU primitives was initially an
-accident of implementation, and later experience with conditional
-synchronization primitives caused me to elevate this
-accident to a guarantee.
-Therefore, the justification for adding a conditional primitive to
-RCU would need to be based on detailed and compelling use cases.

    Guaranteed Read-to-Write Upgrade

    - -

-As far as RCU is concerned, it is always possible to carry out an
-update within an RCU read-side critical section.
-For example, that RCU read-side critical section might search for
-a given data element, and then might acquire the update-side
-spinlock in order to update that element, all while remaining
-in that RCU read-side critical section.
-Of course, it is necessary to exit the RCU read-side critical section
-before invoking synchronize_rcu(); however, this
-inconvenience can be avoided through use of the
-call_rcu() and kfree_rcu() API members
-described later in this document.
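-The search-then-update pattern might be sketched as follows, reusing
-the gp and gp_lock from the earlier examples; the recheck on line 5
-guards against the element having been replaced before the lock was
-acquired:
-
- 1 rcu_read_lock();
- 2 p = rcu_dereference(gp);
- 3 if (p) {
- 4   spin_lock(&gp_lock);             /* Upgrade to update side. */
- 5   if (p == rcu_access_pointer(gp)) /* Still the current element? */
- 6     p->a++;                        /* Then update it under the lock. */
- 7   spin_unlock(&gp_lock);
- 8 }
- 9 rcu_read_unlock();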

    @@QQ@@ -But how does the upgrade-to-write operation exclude other readers? -

    @@QQA@@ -It doesn't, just like normal RCU updates, which also do not exclude -RCU readers. -

    @@QQE@@ - -

    -This guarantee allows lookup code to be shared between read-side -and update-side code, and was premeditated, appearing in the earliest -DYNIX/ptx RCU documentation. - -

    Fundamental Non-Requirements

    - -

-RCU provides extremely lightweight readers, and its read-side guarantees,
-though quite useful, are correspondingly lightweight.
-It is therefore all too easy to assume that RCU is guaranteeing more
-than it really is.
-Of course, the list of things that RCU does not guarantee is infinitely
-long; however, the following sections list a few non-guarantees that
-have caused confusion.
-Except where otherwise noted, these non-guarantees were premeditated.

      -
    1. - Readers Impose Minimal Ordering -
    2. - Readers Do Not Exclude Updaters -
    3. - Updaters Only Wait For Old Readers -
    4. - Grace Periods Don't Partition Read-Side Critical Sections -
    5. - Read-Side Critical Sections Don't Partition Grace Periods -
    6. - Disabling Preemption Does Not Block Grace Periods -
    - -

    Readers Impose Minimal Ordering

    - -

    -Reader-side markers such as rcu_read_lock() and -rcu_read_unlock() provide absolutely no ordering guarantees -except through their interaction with the grace-period APIs such as -synchronize_rcu(). -To see this, consider the following pair of threads: - -

    -
    - 1 void thread0(void)
    - 2 {
    - 3   rcu_read_lock();
    - 4   WRITE_ONCE(x, 1);
    - 5   rcu_read_unlock();
    - 6   rcu_read_lock();
    - 7   WRITE_ONCE(y, 1);
    - 8   rcu_read_unlock();
    - 9 }
    -10
    -11 void thread1(void)
    -12 {
    -13   rcu_read_lock();
    -14   r1 = READ_ONCE(y);
    -15   rcu_read_unlock();
    -16   rcu_read_lock();
    -17   r2 = READ_ONCE(x);
    -18   rcu_read_unlock();
    -19 }
    -
    -
    - -

    -After thread0() and thread1() execute -concurrently, it is quite possible to have - -

    -
    -(r1 == 1 && r2 == 0)
    -
    -
    - -(that is, y appears to have been assigned before x), -which would not be possible if rcu_read_lock() and -rcu_read_unlock() had much in the way of ordering -properties. -But they do not, so the CPU is within its rights -to do significant reordering. -This is by design: Any significant ordering constraints would slow down -these fast-path APIs. - -

    @@QQ@@ -Can't the compiler also reorder this code? -

    @@QQA@@ -No, the volatile casts in READ_ONCE() and -WRITE_ONCE() prevent the compiler from reordering in -this particular case. -

    @@QQE@@ - -
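-If ordering between the two threads' critical sections is needed,
-it must be supplied explicitly.
-For example (a sketch), a pair of smp_mb() calls would forbid the
-(r1 == 1 && r2 == 0) outcome:
-
- 1 void thread0(void)
- 2 {
- 3   rcu_read_lock();
- 4   WRITE_ONCE(x, 1);
- 5   rcu_read_unlock();
- 6   smp_mb(); /* Order the store to x before the store to y. */
- 7   rcu_read_lock();
- 8   WRITE_ONCE(y, 1);
- 9   rcu_read_unlock();
-10 }
-11
-12 void thread1(void)
-13 {
-14   rcu_read_lock();
-15   r1 = READ_ONCE(y);
-16   rcu_read_unlock();
-17   smp_mb(); /* Order the load of y before the load of x. */
-18   rcu_read_lock();
-19   r2 = READ_ONCE(x);
-20   rcu_read_unlock();
-21 }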

    Readers Do Not Exclude Updaters

    - -

    -Neither rcu_read_lock() nor rcu_read_unlock() -exclude updates. -All they do is to prevent grace periods from ending. -The following example illustrates this: - -

    -
    - 1 void thread0(void)
    - 2 {
    - 3   rcu_read_lock();
    - 4   r1 = READ_ONCE(y);
    - 5   if (r1) {
    - 6     do_something_with_nonzero_x();
    - 7     r2 = READ_ONCE(x);
    - 8     WARN_ON(!r2); /* BUG!!! */
    - 9   }
    -10   rcu_read_unlock();
    -11 }
    -12
    -13 void thread1(void)
    -14 {
    -15   spin_lock(&my_lock);
    -16   WRITE_ONCE(x, 1);
    -17   WRITE_ONCE(y, 1);
    -18   spin_unlock(&my_lock);
    -19 }
    -
    -
    - -

    -If the thread0() function's rcu_read_lock() -excluded the thread1() function's update, -the WARN_ON() could never fire. -But the fact is that rcu_read_lock() does not exclude -much of anything aside from subsequent grace periods, of which -thread1() has none, so the -WARN_ON() can and does fire. - -
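-If thread0() really did need r2 to be nonzero whenever r1 is nonzero,
-the ordering would have to come from something other than
-rcu_read_lock(), for example (a sketch) from an acquire/release
-pairing on y:
-
- 1 /* In thread1(): order the store to x before the store to y. */
- 2 WRITE_ONCE(x, 1);
- 3 smp_store_release(&y, 1);
- 4
- 5 /* In thread0(): order the load of y before the load of x. */
- 6 r1 = smp_load_acquire(&y);
- 7 if (r1)
- 8   r2 = READ_ONCE(x); /* Guaranteed nonzero when r1 is nonzero. */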

    Updaters Only Wait For Old Readers

    - -

    -It might be tempting to assume that after synchronize_rcu() -completes, there are no readers executing. -This temptation must be avoided because -new readers can start immediately after synchronize_rcu() -starts, and synchronize_rcu() is under no -obligation to wait for these new readers. - -

    @@QQ@@ -Suppose that synchronize_rcu() did wait until all readers had completed. -Would the updater be able to rely on this? -

    @@QQA@@ -No. -Even if synchronize_rcu() were to wait until -all readers had completed, a new reader might start immediately after -synchronize_rcu() completed. -Therefore, the code following -synchronize_rcu() cannot rely on there being no readers -in any case. -

    @@QQE@@ - -

    -Grace Periods Don't Partition Read-Side Critical Sections

    - -

-It is tempting to assume that if any part of one RCU read-side critical
-section precedes a given grace period, and if any part of another RCU
-read-side critical section follows that same grace period, then all of
-the first RCU read-side critical section must precede all of the second.
-However, this just isn't the case: A single grace period does not
-partition the set of RCU read-side critical sections.
-An example of this situation can be illustrated as follows, where
-a, b, and c are initially all zero:

    -
    - 1 void thread0(void)
    - 2 {
    - 3   rcu_read_lock();
    - 4   WRITE_ONCE(a, 1);
    - 5   WRITE_ONCE(b, 1);
    - 6   rcu_read_unlock();
    - 7 }
    - 8
    - 9 void thread1(void)
    -10 {
    -11   r1 = READ_ONCE(a);
    -12   synchronize_rcu();
    -13   WRITE_ONCE(c, 1);
    -14 }
    -15
    -16 void thread2(void)
    -17 {
    -18   rcu_read_lock();
    -19   r2 = READ_ONCE(b);
    -20   r3 = READ_ONCE(c);
    -21   rcu_read_unlock();
    -22 }
    -
    -
    - -

    -It turns out that the outcome: - -

    -
    -(r1 == 1 && r2 == 0 && r3 == 1)
    -
    -
-
-is entirely possible.
-The following figure shows how this can happen, with each circled
-QS indicating the point at which RCU recorded a
-quiescent state for each thread, that is, a state in which
-RCU knows that the thread cannot be in the midst of an RCU read-side
-critical section that started before the current grace period:

    GPpartitionReaders1.svg

    - -

    -If it is necessary to partition RCU read-side critical sections in this -manner, it is necessary to use two grace periods, where the first -grace period is known to end before the second grace period starts: - -

    -
    - 1 void thread0(void)
    - 2 {
    - 3   rcu_read_lock();
    - 4   WRITE_ONCE(a, 1);
    - 5   WRITE_ONCE(b, 1);
    - 6   rcu_read_unlock();
    - 7 }
    - 8
    - 9 void thread1(void)
    -10 {
    -11   r1 = READ_ONCE(a);
    -12   synchronize_rcu();
    -13   WRITE_ONCE(c, 1);
    -14 }
    -15
    -16 void thread2(void)
    -17 {
    -18   r2 = READ_ONCE(c);
    -19   synchronize_rcu();
    -20   WRITE_ONCE(d, 1);
    -21 }
    -22
    -23 void thread3(void)
    -24 {
    -25   rcu_read_lock();
    -26   r3 = READ_ONCE(b);
    -27   r4 = READ_ONCE(d);
    -28   rcu_read_unlock();
    -29 }
    -
    -
    - -

-Here, if (r1 == 1), then
-thread0()'s write to b must happen
-before the end of thread1()'s grace period.
-If in addition (r4 == 1), then
-thread3()'s read from b must happen
-after the beginning of thread2()'s grace period.
-If it is also the case that (r2 == 1), then the
-end of thread1()'s grace period must precede the
-beginning of thread2()'s grace period.
-This means that the two RCU read-side critical sections cannot overlap,
-guaranteeing that (r3 == 1).
-As a result, the outcome:

    -
    -(r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1)
    -
    -
    - -cannot happen. - -

    -This non-requirement was also non-premeditated, but became apparent -when studying RCU's interaction with memory ordering. - -

    -Read-Side Critical Sections Don't Partition Grace Periods

    - -

    -It is also tempting to assume that if an RCU read-side critical section -happens between a pair of grace periods, then those grace periods cannot -overlap. -However, this temptation leads nowhere good, as can be illustrated by -the following, with all variables initially zero: - -

    -
    - 1 void thread0(void)
    - 2 {
    - 3   rcu_read_lock();
    - 4   WRITE_ONCE(a, 1);
    - 5   WRITE_ONCE(b, 1);
    - 6   rcu_read_unlock();
    - 7 }
    - 8
    - 9 void thread1(void)
    -10 {
    -11   r1 = READ_ONCE(a);
    -12   synchronize_rcu();
    -13   WRITE_ONCE(c, 1);
    -14 }
    -15
    -16 void thread2(void)
    -17 {
    -18   rcu_read_lock();
    -19   WRITE_ONCE(d, 1);
    -20   r2 = READ_ONCE(c);
    -21   rcu_read_unlock();
    -22 }
    -23
    -24 void thread3(void)
    -25 {
    -26   r3 = READ_ONCE(d);
    -27   synchronize_rcu();
    -28   WRITE_ONCE(e, 1);
    -29 }
    -30
    -31 void thread4(void)
    -32 {
    -33   rcu_read_lock();
    -34   r4 = READ_ONCE(b);
    -35   r5 = READ_ONCE(e);
    -36   rcu_read_unlock();
    -37 }
    -
    -
    - -

    -In this case, the outcome: - -

    -
    -(r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1)
    -
    -
    - -is entirely possible, as illustrated below: - -

    ReadersPartitionGP1.svg

    - -

    -Again, an RCU read-side critical section can overlap almost all of a -given grace period, just so long as it does not overlap the entire -grace period. -As a result, an RCU read-side critical section cannot partition a pair -of RCU grace periods. - -

    @@QQ@@ -How long a sequence of grace periods, each separated by an RCU read-side -critical section, would be required to partition the RCU read-side -critical sections at the beginning and end of the chain? -

    @@QQA@@ -In theory, an infinite number. -In practice, an unknown number that is sensitive to both implementation -details and timing considerations. -Therefore, even in practice, RCU users must abide by the theoretical rather -than the practical answer. -

    @@QQE@@ - -

    -Disabling Preemption Does Not Block Grace Periods

    - -

    -There was a time when disabling preemption on any given CPU would block -subsequent grace periods. -However, this was an accident of implementation and is not a requirement. -And in the current Linux-kernel implementation, disabling preemption -on a given CPU in fact does not block grace periods, as Oleg Nesterov -demonstrated. - -

    -If you need a preempt-disable region to block grace periods, you need to add -rcu_read_lock() and rcu_read_unlock(), for example -as follows: - -

    -
    - 1 preempt_disable();
    - 2 rcu_read_lock();
    - 3 do_something();
    - 4 rcu_read_unlock();
    - 5 preempt_enable();
    - 6
    - 7 /* Spinlocks implicitly disable preemption. */
    - 8 spin_lock(&mylock);
    - 9 rcu_read_lock();
    -10 do_something();
    -11 rcu_read_unlock();
    -12 spin_unlock(&mylock);
    -
    -
    - -

    -In theory, you could enter the RCU read-side critical section first, -but it is more efficient to keep the entire RCU read-side critical -section contained in the preempt-disable region as shown above. -Of course, RCU read-side critical sections that extend outside of -preempt-disable regions will work correctly, but such critical sections -can be preempted, which forces rcu_read_unlock() to do -more work. -And no, this is not an invitation to enclose all of your RCU -read-side critical sections within preempt-disable regions, because -doing so would degrade real-time response. - -

    -This non-requirement appeared with preemptible RCU. -If you need a grace period that waits on non-preemptible code regions, use -RCU-sched. - -

    Parallelism Facts of Life

    - -

    -These parallelism facts of life are by no means specific to RCU, but -the RCU implementation must abide by them. -They therefore bear repeating: - -

      -
    1. Any CPU or task may be delayed at any time, - and any attempts to avoid these delays by disabling - preemption, interrupts, or whatever are completely futile. - This is most obvious in preemptible user-level - environments and in virtualized environments (where - a given guest OS's VCPUs can be preempted at any time by - the underlying hypervisor), but can also happen in bare-metal - environments due to ECC errors, NMIs, and other hardware - events. - Although a delay of more than about 20 seconds can result - in splats, the RCU implementation is obligated to use - algorithms that can tolerate extremely long delays, but where - “extremely long” is not long enough to allow - wrap-around when incrementing a 64-bit counter. -
    2. Both the compiler and the CPU can reorder memory accesses. - Where it matters, RCU must use compiler directives and - memory-barrier instructions to preserve ordering. -
    3. Conflicting writes to memory locations in any given cache line - will result in expensive cache misses. - Greater numbers of concurrent writes and more-frequent - concurrent writes will result in more dramatic slowdowns. - RCU is therefore obligated to use algorithms that have - sufficient locality to avoid significant performance and - scalability problems. -
    4. As a rough rule of thumb, only one CPU's worth of processing - may be carried out under the protection of any given exclusive - lock. - RCU must therefore use scalable locking designs. -
5. Counters are finite, especially on 32-bit systems.
- RCU's use of counters must therefore tolerate counter wrap,
- or be designed such that counter wrap would take way more
- time than a single system is likely to run.
- An uptime of ten years is quite possible, a runtime
- of a century much less so.
- As an example of the latter, RCU's dyntick-idle nesting counter
- allows 54 bits for interrupt nesting level (this counter
- is 64 bits even on a 32-bit system).
- Overflowing this counter requires 2^54
- half-interrupts on a given CPU without that CPU ever going idle.
- If a half-interrupt happened every microsecond, it would take
- 570 years of runtime to overflow this counter, which is currently
- believed to be an acceptably long time.
- (See the arithmetic sketch following this list.)
    6. Linux systems can have thousands of CPUs running a single - Linux kernel in a single shared-memory environment. - RCU must therefore pay close attention to high-end scalability. -
    - -
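-The 570-year figure in item 5 above is easily checked.
-The following stand-alone userspace program (an illustration only,
-not kernel code) does the arithmetic:
-
- 1 #include <stdio.h>
- 2
- 3 int main(void)
- 4 {
- 5   double us = 18014398509481984.0;       /* 2^54 microseconds. */
- 6   double sec = us / 1e6;
- 7   double years = sec / (365.25 * 24 * 3600);
- 8   printf("%.0f years\n", years);         /* Prints about 571. */
- 9   return 0;
-10 }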

-This last parallelism fact of life means that RCU must pay special
-attention to the preceding facts of life.
-The idea that Linux might scale to systems with thousands of CPUs would
-have been met with some skepticism in the 1990s, but these requirements
-would otherwise have been unsurprising, even in the early 1990s.

    Quality-of-Implementation Requirements

    - -

    -These sections list quality-of-implementation requirements. -Although an RCU implementation that ignores these requirements could -still be used, it would likely be subject to limitations that would -make it inappropriate for industrial-strength production use. -Classes of quality-of-implementation requirements are as follows: - -

      -
    1. Specialization -
    2. Performance and Scalability -
    3. Composability -
    4. Corner Cases -
    - -

-These classes are covered in the following sections.

    Specialization

    - -

    -RCU is and always has been intended primarily for read-mostly situations, -which means that RCU's read-side primitives are optimized, often at the -expense of its update-side primitives. -Experience thus far is captured by the following list of situations: - -

      -
    1. Read-mostly data, where stale and inconsistent data is not - a problem: RCU works great! -
    2. Read-mostly data, where data must be consistent: - RCU works well. -
    3. Read-write data, where data must be consistent: - RCU might work OK. - Or not. -
    4. Write-mostly data, where data must be consistent: - RCU is very unlikely to be the right tool for the job, - with the following exceptions, where RCU can provide: -
        -
      1. Existence guarantees for update-friendly mechanisms. -
      2. Wait-free read-side primitives for real-time use. -
      -
    - -

    -This focus on read-mostly situations means that RCU must interoperate -with other synchronization primitives. -For example, the add_gp() and remove_gp_synchronous() -examples discussed earlier use RCU to protect readers and locking to -coordinate updaters. -However, the need extends much farther, requiring that a variety of -synchronization primitives be legal within RCU read-side critical sections, -including spinlocks, sequence locks, atomic operations, reference -counters, and memory barriers. - -

    @@QQ@@ -What about sleeping locks? -

@@QQA@@
-These are forbidden within Linux-kernel RCU read-side critical sections
-because it is not legal to place a quiescent state (in this case,
-voluntary context switch) within an RCU read-side critical section.
-However, sleeping locks may be used within userspace RCU read-side critical
-sections, and also within Linux-kernel sleepable RCU
-(SRCU)
-read-side critical sections.
-In addition, the -rt patchset turns spinlocks into sleeping locks so
-that the corresponding critical sections can be preempted, which
-also means that these sleeplockified spinlocks (but not other sleeping locks!)
-may be acquired within -rt-Linux-kernel RCU read-side critical sections.

-Note that it is legal for a normal RCU read-side critical section
-to conditionally acquire a sleeping lock (as in mutex_trylock()),
-but only as long as it does not loop indefinitely attempting to
-conditionally acquire that sleeping lock.
-The key point is that things like mutex_trylock()
-either return with the mutex held, or return an error indication if
-the mutex was not immediately available.
-Either way, mutex_trylock() returns immediately without sleeping;
-a sketch of this pattern follows this quick quiz.

    @@QQE@@ - -
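-The trylock pattern described in the preceding quick quiz might be
-sketched as follows; the ->mutex field and do_something_locked() are
-hypothetical stand-ins for whatever per-element mutex and locked-mode
-processing a real use case would supply:
-
- 1 rcu_read_lock();
- 2 p = rcu_dereference(gp);
- 3 if (p && mutex_trylock(&p->mutex)) {
- 4   /* Acquired without sleeping, so still legal under RCU. */
- 5   do_something_locked(p);
- 6   mutex_unlock(&p->mutex);
- 7 }
- 8 rcu_read_unlock();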

-It often comes as a surprise that many algorithms do not require a
-consistent view of data, with network routing being the poster child.
-Internet routing algorithms take significant time to propagate
-updates, so that by the time an update arrives at a given system,
-that system has been sending network traffic the wrong way for
-a considerable length of time.
-Having a few threads continue to send traffic the wrong way for a
-few more milliseconds is clearly not a problem: In the worst case,
-TCP retransmissions will eventually get the data where it needs to go.
-In general, when tracking the state of the universe outside of the
-computer, some level of inconsistency must be tolerated due to
-speed-of-light delays if nothing else.

-Furthermore, uncertainty about external state is inherent in many cases.
-For example, a pair of veterinarians might use heartbeat to determine
-whether or not a given cat was alive.
-But how long should they wait after the last heartbeat to decide that
-the cat is in fact dead?
-Waiting less than 400 milliseconds makes no sense because this would
-mean that a relaxed cat would be considered to cycle between death
-and life more than 100 times per minute.
-Moreover, just as with human beings, a cat's heart might stop for
-some period of time, so the exact wait period is a judgment call.
-One of our pair of veterinarians might wait 30 seconds before pronouncing
-the cat dead, while the other might insist on waiting a full minute.
-The two veterinarians would then disagree on the state of the cat during
-the final 30 seconds of the minute following the last heartbeat.

    -Interestingly enough, this same situation applies to hardware. -When push comes to shove, how do we tell whether or not some -external server has failed? -We send messages to it periodically, and declare it failed if we -don't receive a response within a given period of time. -Policy decisions can usually tolerate short -periods of inconsistency. -The policy was decided some time ago, and is only now being put into -effect, so a few milliseconds of delay is normally inconsequential. - -

-However, there are algorithms that absolutely must see consistent data.
-For example, the translation from a user-level SystemV semaphore
-ID to the corresponding in-kernel data structure is protected by RCU,
-but it is absolutely forbidden to update a semaphore that has just been
-removed.
-In the Linux kernel, this need for consistency is accommodated by acquiring
-spinlocks located in the in-kernel data structure from within
-the RCU read-side critical section, and this is indicated by the
-green box in the figure above.
-Many other techniques may be used, and are in fact used within the
-Linux kernel.
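-In outline, and with hypothetical names (sem_lookup(), ->deleted, and
-update_semaphore() are stand-ins rather than the actual ipc code),
-this technique looks as follows:
-
- 1 rcu_read_lock();
- 2 sma = sem_lookup(semid); /* RCU-protected ID-to-structure lookup. */
- 3 if (sma) {
- 4   spin_lock(&sma->lock); /* This lock lives inside the structure. */
- 5   if (!sma->deleted)     /* Consistent: not removed in the meantime. */
- 6     update_semaphore(sma);
- 7   spin_unlock(&sma->lock);
- 8 }
- 9 rcu_read_unlock();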

    -In short, RCU is not required to maintain consistency, and other -mechanisms may be used in concert with RCU when consistency is required. -RCU's specialization allows it to do its job extremely well, and its -ability to interoperate with other synchronization mechanisms allows -the right mix of synchronization tools to be used for a given job. - -

    Performance and Scalability

    - -

    -Energy efficiency is a critical component of performance today, -and Linux-kernel RCU implementations must therefore avoid unnecessarily -awakening idle CPUs. -I cannot claim that this requirement was premeditated. -In fact, I learned of it during a telephone conversation in which I -was given “frank and open” feedback on the importance -of energy efficiency in battery-powered systems and on specific -energy-efficiency shortcomings of the Linux-kernel RCU implementation. -In my experience, the battery-powered embedded community will consider -any unnecessary wakeups to be extremely unfriendly acts. -So much so that mere Linux-kernel-mailing-list posts are -insufficient to vent their ire. - -

-Memory consumption is not particularly important in most
-situations, and has become decreasingly
-so as memory sizes have expanded and memory
-costs have plummeted.
-However, as I learned from Matt Mackall's
-bloatwatch
-efforts, memory footprint is critically important on single-CPU systems with
-non-preemptible (CONFIG_PREEMPT=n) kernels, and thus
-tiny RCU
-was born.
-Josh Triplett has since taken over the small-memory banner with his
-Linux kernel tinification
-project, which resulted in
-SRCU
-becoming optional for those kernels not needing it.

    -The remaining performance requirements are, for the most part, -unsurprising. -For example, in keeping with RCU's read-side specialization, -rcu_dereference() should have negligible overhead (for -example, suppression of a few minor compiler optimizations). -Similarly, in non-preemptible environments, rcu_read_lock() and -rcu_read_unlock() should have exactly zero overhead. - -

    -In preemptible environments, in the case where the RCU read-side -critical section was not preempted (as will be the case for the -highest-priority real-time process), rcu_read_lock() and -rcu_read_unlock() should have minimal overhead. -In particular, they should not contain atomic read-modify-write -operations, memory-barrier instructions, preemption disabling, -interrupt disabling, or backwards branches. -However, in the case where the RCU read-side critical section was preempted, -rcu_read_unlock() may acquire spinlocks and disable interrupts. -This is why it is better to nest an RCU read-side critical section -within a preempt-disable region than vice versa, at least in cases -where that critical section is short enough to avoid unduly degrading -real-time latencies. - -

    -The synchronize_rcu() grace-period-wait primitive is -optimized for throughput. -It may therefore incur several milliseconds of latency in addition to -the duration of the longest RCU read-side critical section. -On the other hand, multiple concurrent invocations of -synchronize_rcu() are required to use batching optimizations -so that they can be satisfied by a single underlying grace-period-wait -operation. -For example, in the Linux kernel, it is not unusual for a single -grace-period-wait operation to serve more than -1,000 separate invocations -of synchronize_rcu(), thus amortizing the per-invocation -overhead down to nearly zero. -However, the grace-period optimization is also required to avoid -measurable degradation of real-time scheduling and interrupt latencies. - -

    -In some cases, the multi-millisecond synchronize_rcu() -latencies are unacceptable. -In these cases, synchronize_rcu_expedited() may be used -instead, reducing the grace-period latency down to a few tens of -microseconds on small systems, at least in cases where the RCU read-side -critical sections are short. -There are currently no special latency requirements for -synchronize_rcu_expedited() on large systems, but, -consistent with the empirical nature of the RCU specification, -that is subject to change. -However, there most definitely are scalability requirements: -A storm of synchronize_rcu_expedited() invocations on 4096 -CPUs should at least make reasonable forward progress. -In return for its shorter latencies, synchronize_rcu_expedited() -is permitted to impose modest degradation of real-time latency -on non-idle online CPUs. -That said, it will likely be necessary to take further steps to reduce this -degradation, hopefully to roughly that of a scheduling-clock interrupt. - -

    -There are a number of situations where even -synchronize_rcu_expedited()'s reduced grace-period -latency is unacceptable. -In these situations, the asynchronous call_rcu() can be -used in place of synchronize_rcu() as follows: - -

    -
    - 1 struct foo {
    - 2   int a;
    - 3   int b;
    - 4   struct rcu_head rh;
    - 5 };
    - 6
    - 7 static void remove_gp_cb(struct rcu_head *rhp)
    - 8 {
    - 9   struct foo *p = container_of(rhp, struct foo, rh);
    -10
    -11   kfree(p);
    -12 }
    -13
    -14 bool remove_gp_asynchronous(void)
    -15 {
    -16   struct foo *p;
    -17
    -18   spin_lock(&gp_lock);
-19   p = rcu_access_pointer(gp);
    -20   if (!p) {
    -21     spin_unlock(&gp_lock);
    -22     return false;
    -23   }
    -24   rcu_assign_pointer(gp, NULL);
    -25   call_rcu(&p->rh, remove_gp_cb);
    -26   spin_unlock(&gp_lock);
    -27   return true;
    -28 }
    -
    -
    - -

-A definition of struct foo is finally needed, and appears
-on lines 1-5.
-The function remove_gp_cb() is passed to call_rcu()
-on line 25, and will be invoked after the end of a subsequent
-grace period.
-This gets the same effect as remove_gp_synchronous(),
-but without forcing the updater to wait for a grace period to elapse.
-The call_rcu() function may be used in a number of
-situations where neither synchronize_rcu() nor
-synchronize_rcu_expedited() would be legal,
-including within preempt-disable code, local_bh_disable() code,
-interrupt-disable code, and interrupt handlers.
-However, even call_rcu() is illegal within NMI handlers
-and from offline CPUs.
-The callback function (remove_gp_cb() in this case) will be
-executed within the Linux kernel's softirq (software interrupt)
-environment,
-either within a real softirq handler or under the protection
-of local_bh_disable().
-In both the Linux kernel and userspace, it is bad practice to
-write an RCU callback function that takes too long.
-Long-running operations should be relegated to separate threads or
-(in the Linux kernel) workqueues.

    @@QQ@@ -Why does line 19 use rcu_access_pointer()? -After all, call_rcu() on line 25 stores into the -structure, which would interact badly with concurrent insertions. -Doesn't this mean that rcu_dereference() is required? -

@@QQA@@
-Presumably the gp_lock acquired on line 18 excludes
-any changes, including any insertions that rcu_dereference()
-would protect against.
-Therefore, any insertions will be delayed until after gp_lock
-is released on line 26, which in turn means that
-rcu_access_pointer() suffices.

    @@QQE@@ - -

    -However, all that remove_gp_cb() is doing is -invoking kfree() on the data element. -This is a common idiom, and is supported by kfree_rcu(), -which allows “fire and forget” operation as shown below: - -

    -
    - 1 struct foo {
    - 2   int a;
    - 3   int b;
    - 4   struct rcu_head rh;
    - 5 };
    - 6
    - 7 bool remove_gp_faf(void)
    - 8 {
    - 9   struct foo *p;
    -10
    -11   spin_lock(&gp_lock);
    -12   p = rcu_dereference(gp);
    -13   if (!p) {
    -14     spin_unlock(&gp_lock);
    -15     return false;
    -16   }
    -17   rcu_assign_pointer(gp, NULL);
    -18   kfree_rcu(p, rh);
    -19   spin_unlock(&gp_lock);
    -20   return true;
    -21 }
    -
    -
    - -

    -Note that remove_gp_faf() simply invokes -kfree_rcu() and proceeds, without any need to pay any -further attention to the subsequent grace period and kfree(). -It is permissible to invoke kfree_rcu() from the same -environments as for call_rcu(). -Interestingly enough, DYNIX/ptx had the equivalents of -call_rcu() and kfree_rcu(), but not -synchronize_rcu(). -This was due to the fact that RCU was not heavily used within DYNIX/ptx, -so the very few places that needed something like -synchronize_rcu() simply open-coded it. - -

    @@QQ@@ -Earlier it was claimed that call_rcu() and -kfree_rcu() allowed updaters to avoid being blocked -by readers. -But how can that be correct, given that the invocation of the callback -and the freeing of the memory (respectively) must still wait for -a grace period to elapse? -

    @@QQA@@ -We could define things this way, but keep in mind that this sort of -definition would say that updates in garbage-collected languages -cannot complete until the next time the garbage collector runs, -which does not seem at all reasonable. -The key point is that in most cases, an updater using either -call_rcu() or kfree_rcu() can proceed to the -next update as soon as it has invoked call_rcu() or -kfree_rcu(), without having to wait for a subsequent -grace period. -

    @@QQE@@ - -

    -But what if the updater must wait for the completion of code to be -executed after the end of the grace period, but has other tasks -that can be carried out in the meantime? -The polling-style get_state_synchronize_rcu() and -cond_synchronize_rcu() functions may be used for this -purpose, as shown below: - -

    -
    - 1 bool remove_gp_poll(void)
    - 2 {
    - 3   struct foo *p;
    - 4   unsigned long s;
    - 5
    - 6   spin_lock(&gp_lock);
    - 7   p = rcu_access_pointer(gp);
    - 8   if (!p) {
    - 9     spin_unlock(&gp_lock);
    -10     return false;
    -11   }
    -12   rcu_assign_pointer(gp, NULL);
    -13   spin_unlock(&gp_lock);
    -14   s = get_state_synchronize_rcu();
    -15   do_something_while_waiting();
    -16   cond_synchronize_rcu(s);
    -17   kfree(p);
    -18   return true;
    -19 }
    -
    -
    - -

-On line 14, get_state_synchronize_rcu() obtains a
-“cookie” from RCU,
-then line 15 carries out other tasks,
-and finally, line 16 returns immediately if a grace period has
-elapsed in the meantime, but otherwise waits as required.
-The need for get_state_synchronize_rcu() and
-cond_synchronize_rcu() has appeared quite recently,
-so it is too early to tell whether they will stand the test of time.

-RCU thus provides a range of tools to allow updaters to strike the
-required tradeoff between latency, flexibility, and CPU overhead.

    Composability

    - -

    -Composability has received much attention in recent years, perhaps in part -due to the collision of multicore hardware with object-oriented techniques -designed in single-threaded environments for single-threaded use. -And in theory, RCU read-side critical sections may be composed, and in -fact may be nested arbitrarily deeply. -In practice, as with all real-world implementations of composable -constructs, there are limitations. - -

    -Implementations of RCU for which rcu_read_lock() -and rcu_read_unlock() generate no code, such as -Linux-kernel RCU when CONFIG_PREEMPT=n, can be -nested arbitrarily deeply. -After all, there is no overhead. -Except that if all these instances of rcu_read_lock() -and rcu_read_unlock() are visible to the compiler, -compilation will eventually fail due to exhausting memory, -mass storage, or user patience, whichever comes first. -If the nesting is not visible to the compiler, as is the case with -mutually recursive functions each in its own translation unit, -stack overflow will result. -If the nesting takes the form of loops, either the control variable -will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. -Nevertheless, this class of RCU implementations is one -of the most composable constructs in existence. - -
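-For example (a sketch; the name get_gp_a() is ours), a helper
-containing its own RCU read-side critical section may be called with
-equal correctness from inside or outside another one:
-
- 1 int get_gp_a(void)
- 2 {
- 3   struct foo *p;
- 4   int ret = -1;
- 5
- 6   rcu_read_lock(); /* Nests harmlessly if the caller also holds it. */
- 7   p = rcu_dereference(gp);
- 8   if (p)
- 9     ret = p->a;
-10   rcu_read_unlock();
-11   return ret;
-12 }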

    -RCU implementations that explicitly track nesting depth -are limited by the nesting-depth counter. -For example, the Linux kernel's preemptible RCU limits nesting to -INT_MAX. -This should suffice for almost all practical purposes. -That said, a consecutive pair of RCU read-side critical sections -between which there is an operation that waits for a grace period -cannot be enclosed in another RCU read-side critical section. -This is because it is not legal to wait for a grace period within -an RCU read-side critical section: To do so would result either -in deadlock or -in RCU implicitly splitting the enclosing RCU read-side critical -section, neither of which is conducive to a long-lived and prosperous -kernel. - -

    -It is worth noting that RCU is not alone in limiting composability. -For example, many transactional-memory implementations prohibit -composing a pair of transactions separated by an irrevocable -operation (for example, a network receive operation). -For another example, lock-based critical sections can be composed -surprisingly freely, but only if deadlock is avoided. - -

    -In short, although RCU read-side critical sections are highly composable, -care is required in some situations, just as is the case for any other -composable synchronization mechanism. - -

    Corner Cases

    - -

    -A given RCU workload might have an endless and intense stream of -RCU read-side critical sections, perhaps even so intense that there -was never a point in time during which there was not at least one -RCU read-side critical section in flight. -RCU cannot allow this situation to block grace periods: As long as -all the RCU read-side critical sections are finite, grace periods -must also be finite. - -

    -That said, preemptible RCU implementations could potentially result -in RCU read-side critical sections being preempted for long durations, -which has the effect of creating a long-duration RCU read-side -critical section. -This situation can arise only in heavily loaded systems, but systems using -real-time priorities are of course more vulnerable. -Therefore, RCU priority boosting is provided to help deal with this -case. -That said, the exact requirements on RCU priority boosting will likely -evolve as more experience accumulates. - -

    -Other workloads might have very high update rates. -Although one can argue that such workloads should instead use -something other than RCU, the fact remains that RCU must -handle such workloads gracefully. -This requirement is another factor driving batching of grace periods, -but it is also the driving force behind the checks for large numbers -of queued RCU callbacks in the call_rcu() code path. -Finally, high update rates should not delay RCU read-side critical -sections, although some read-side delays can occur when using -synchronize_rcu_expedited(), courtesy of this function's use -of try_stop_cpus(). -(In the future, synchronize_rcu_expedited() will be -converted to use lighter-weight inter-processor interrupts (IPIs), -but this will still disturb readers, though to a much smaller degree.) - -

    -Although all three of these corner cases were understood in the early -1990s, a simple user-level test consisting of close(open(path)) -in a tight loop -in the early 2000s suddenly provided a much deeper appreciation of the -high-update-rate corner case. -This test also motivated addition of some RCU code to react to high update -rates, for example, if a given CPU finds itself with more than 10,000 -RCU callbacks queued, it will cause RCU to take evasive action by -more aggressively starting grace periods and more aggressively forcing -completion of grace-period processing. -This evasive action causes the grace period to complete more quickly, -but at the cost of restricting RCU's batching optimizations, thus -increasing the CPU overhead incurred by that grace period. - -

    -Software-Engineering Requirements

    - -

    -Between Murphy's Law and “To err is human”, it is necessary to -guard against mishaps and misuse: - -

      -
1. It is all too easy to forget to use rcu_read_lock()
- everywhere that it is needed, so kernels built with
- CONFIG_PROVE_RCU=y will splat if
- rcu_dereference() is used outside of an
- RCU read-side critical section.
- Update-side code can use rcu_dereference_protected(),
- which takes a
- lockdep expression
- to indicate what is providing the protection.
- If the indicated protection is not provided, a lockdep splat
- is emitted.
- (A sketch of these lockdep-checked accessors follows this list.)

      - Code shared between readers and updaters can use - rcu_dereference_check(), which also takes a - lockdep expression, and emits a lockdep splat if neither - rcu_read_lock() nor the indicated protection - is in place. - In addition, rcu_dereference_raw() is used in those - (hopefully rare) cases where the required protection cannot - be easily described. - Finally, rcu_read_lock_held() is provided to - allow a function to verify that it has been invoked within - an RCU read-side critical section. - I was made aware of this set of requirements shortly after Thomas - Gleixner audited a number of RCU uses. -

    2. A given function might wish to check for RCU-related preconditions - upon entry, before using any other RCU API. - The rcu_lockdep_assert() does this job, - asserting the expression in kernels having lockdep enabled - and doing nothing otherwise. -
    3. It is also easy to forget to use rcu_assign_pointer() - and rcu_dereference(), perhaps (incorrectly) - substituting a simple assignment. - To catch this sort of error, a given RCU-protected pointer may be - tagged with __rcu, after which running sparse - with CONFIG_SPARSE_RCU_POINTER=y will complain - about simple-assignment accesses to that pointer. - Arnd Bergmann made me aware of this requirement, and also - supplied the needed - patch series. -
    4. Kernels built with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y - will splat if a data element is passed to call_rcu() - twice in a row, without a grace period in between. - (This error is similar to a double free.) - The corresponding rcu_head structures that are - dynamically allocated are automatically tracked, but - rcu_head structures allocated on the stack - must be initialized with init_rcu_head_on_stack() - and cleaned up with destroy_rcu_head_on_stack(). - Similarly, statically allocated non-stack rcu_head - structures must be initialized with init_rcu_head() - and cleaned up with destroy_rcu_head(). - Mathieu Desnoyers made me aware of this requirement, and also - supplied the needed - patch. -
    5. An infinite loop in an RCU read-side critical section will - eventually trigger an RCU CPU stall warning splat, with - the duration of “eventually” being controlled by the - RCU_CPU_STALL_TIMEOUT Kconfig option, or, - alternatively, by the - rcupdate.rcu_cpu_stall_timeout boot/sysfs - parameter. - However, RCU is not obligated to produce this splat - unless there is a grace period waiting on that particular - RCU read-side critical section. -

      - Some extreme workloads might intentionally delay - RCU grace periods, and systems running those workloads can - be booted with rcupdate.rcu_cpu_stall_suppress - to suppress the splats. - This kernel parameter may also be set via sysfs. - Furthermore, RCU CPU stall warnings are counter-productive - during sysrq dumps and during panics. - RCU therefore supplies the rcu_sysrq_start() and - rcu_sysrq_end() API members to be called before - and after long sysrq dumps. - RCU also supplies the rcu_panic() notifier that is - automatically invoked at the beginning of a panic to suppress - further RCU CPU stall warnings. - -

      - This requirement made itself known in the early 1990s, pretty - much the first time that it was necessary to debug a CPU stall. - That said, the initial implementation in DYNIX/ptx was quite - generic in comparison with that of Linux. -

    6. Although it would be very good to detect pointers leaking out - of RCU read-side critical sections, there is currently no - good way of doing this. - One complication is the need to distinguish between pointers - leaking and pointers that have been handed off from RCU to - some other synchronization mechanism, for example, reference - counting. -
    7. In kernels built with CONFIG_RCU_TRACE=y, RCU-related - information is provided via both debugfs and event tracing. -
    8. Open-coded use of rcu_assign_pointer() and - rcu_dereference() to create typical linked - data structures can be surprisingly error-prone. - Therefore, RCU-protected - linked lists - and, more recently, RCU-protected - hash tables - are available. - Many other special-purpose RCU-protected data structures are - available in the Linux kernel and the userspace RCU library. -
    9. Some linked structures are created at compile time, but still - require __rcu checking. - The RCU_POINTER_INITIALIZER() macro serves this - purpose. -
    10. It is not necessary to use rcu_assign_pointer() - when creating linked structures that are to be published via - a single external pointer. - The RCU_INIT_POINTER() macro is provided for - this task and also for assigning NULL pointers - at runtime. -
    - -
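-A sketch of the lockdep-checked accessors mentioned in item 1 above,
-again assuming the gp and gp_lock from the earlier examples:
-
- 1 /* Update-side code: must hold gp_lock. */
- 2 p = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
- 3
- 4 /* Code shared between readers and updaters: either protection suffices. */
- 5 p = rcu_dereference_check(gp, lockdep_is_held(&gp_lock));
- 6
- 7 /* Check a precondition on entry to a read-side helper. */
- 8 rcu_lockdep_assert(rcu_read_lock_held(),
- 9                    "need rcu_read_lock() here");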

-This is not a hard-and-fast list: RCU's diagnostic capabilities will
-continue to be guided by the number and type of usage bugs found
-in real-world RCU usage.

    Linux Kernel Complications

    - -

    -The Linux kernel provides an interesting environment for all kinds of -software, including RCU. -Some of the relevant points of interest are as follows: - -

      -
    1. Configuration. -
    2. Firmware Interface. -
    3. Early Boot. -
    4. - Interrupts and non-maskable interrupts (NMIs). -
    5. Loadable Modules. -
    6. Hotplug CPU. -
    7. Scheduler and RCU. -
    8. Tracing and RCU. -
    9. Energy Efficiency. -
    10. Memory Efficiency. -
    11. - Performance, Scalability, Response Time, and Reliability. -
    - -

    -This list is probably incomplete, but it does give a feel for the -most notable Linux-kernel complications. -Each of the following sections covers one of the above topics. - -

    Configuration

    - -

    -RCU's goal is automatic configuration, so that almost nobody -needs to worry about RCU's Kconfig options. -And for almost all users, RCU does in fact work well -“out of the box.” - -

-However, there are specialized use cases that are handled by
-kernel boot parameters and Kconfig options.
-Unfortunately, the Kconfig system will explicitly ask users
-about new Kconfig options, which requires that almost all of them
-be hidden behind a CONFIG_RCU_EXPERT Kconfig option.

    -This all should be quite obvious, but the fact remains that -Linus Torvalds recently had to -remind -me of this requirement. - -

    Firmware Interface

    - -

-In many cases, the kernel obtains information about the system from the
-firmware, and sometimes things are lost in translation.
-Or the translation is accurate, but the original message is bogus.

    -For example, some systems' firmware overreports the number of CPUs, -sometimes by a large factor. -If RCU naively believed the firmware, as it used to do, -it would create too many per-CPU kthreads. -Although the resulting system will still run correctly, the extra -kthreads needlessly consume memory and can cause confusion -when they show up in ps listings. - -

    -RCU must therefore wait for a given CPU to actually come online before -it can allow itself to believe that the CPU actually exists. -The resulting “ghost CPUs” (which are never going to -come online) cause a number of -interesting complications. - -

    Early Boot

    - -

    -The Linux kernel's boot sequence is an interesting process, -and RCU is used early, even before rcu_init() -is invoked. -In fact, a number of RCU's primitives can be used as soon as the -initial task's task_struct is available and the -boot CPU's per-CPU variables are set up. -The read-side primitives (rcu_read_lock(), -rcu_read_unlock(), rcu_dereference(), -and rcu_access_pointer()) will operate normally very early on, -as will rcu_assign_pointer(). - -

-Although call_rcu() may be invoked at any
-time during boot, callbacks are not guaranteed to be invoked until after
-the scheduler is fully up and running.
-This delay in callback invocation is due to the fact that RCU does not
-invoke callbacks until it is fully initialized, and this full initialization
-cannot occur until after the scheduler has initialized itself to the
-point where RCU can spawn and run its kthreads.
-In theory, it would be possible to invoke callbacks earlier;
-however, this is not a panacea because there would be severe restrictions
-on what operations those callbacks could invoke.

-Perhaps surprisingly, synchronize_rcu(),
-synchronize_rcu_bh()
-(discussed below),
-and
-synchronize_sched()
-will all operate normally
-during very early boot, the reason being that there is only one CPU
-and preemption is disabled.
-This means that a call to synchronize_rcu() (or to one of its
-friends) is itself a quiescent
-state and thus a grace period, so the early-boot implementation can
-be a no-op.

    -Both synchronize_rcu_bh() and synchronize_sched() -continue to operate normally through the remainder of boot, courtesy -of the fact that preemption is disabled across their RCU read-side -critical sections and also courtesy of the fact that there is still -only one CPU. -However, once the scheduler starts initializing, preemption is enabled. -There is still only a single CPU, but the fact that preemption is enabled -means that the no-op implementation of synchronize_rcu() no -longer works in CONFIG_PREEMPT=y kernels. -Therefore, as soon as the scheduler starts initializing, the early-boot -fastpath is disabled. -This means that synchronize_rcu() switches to its runtime -mode of operation where it posts callbacks, which in turn means that -any call to synchronize_rcu() will block until the corresponding -callback is invoked. -Unfortunately, the callback cannot be invoked until RCU's runtime -grace-period machinery is up and running, which cannot happen until -the scheduler has initialized itself sufficiently to allow RCU's -kthreads to be spawned. -Therefore, invoking synchronize_rcu() during scheduler -initialization can result in deadlock. - -

    @@QQ@@ -So what happens with synchronize_rcu() during -scheduler initialization for CONFIG_PREEMPT=n -kernels? -

@@QQA@@ -In CONFIG_PREEMPT=n kernels, synchronize_rcu() -maps directly to synchronize_sched(). -Therefore, synchronize_rcu() works normally throughout -boot in CONFIG_PREEMPT=n kernels. -However, your code must also work in CONFIG_PREEMPT=y kernels, -so it is still necessary to avoid invoking synchronize_rcu() -during scheduler initialization. -

    @@QQE@@ - -

    -I learned of these boot-time requirements as a result of a series of -system hangs. - -

    Interrupts and NMIs

    - -

    -The Linux kernel has interrupts, and RCU read-side critical sections are -legal within interrupt handlers and within interrupt-disabled regions -of code, as are invocations of call_rcu(). - -
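
-For example, an interrupt handler can safely traverse RCU-protected data,
-as in the following minimal sketch (the handler, the my_data structure,
-and the my_data_ptr pointer are hypothetical, not taken from any real
-driver):

-
- 1 static irqreturn_t my_isr(int irq, void *dev_id)
- 2 {
- 3   struct my_data *p;
- 4
- 5   rcu_read_lock();  /* Legal in hard-interrupt context. */
- 6   p = rcu_dereference(my_data_ptr);
- 7   if (p)
- 8     handle_event(p);  /* Must not block. */
- 9   rcu_read_unlock();
-10   return IRQ_HANDLED;
-11 }
-
-
- -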

    -Some Linux-kernel architectures can enter an interrupt handler from -non-idle process context, and then just never leave it, instead stealthily -transitioning back to process context. -This trick is sometimes used to invoke system calls from inside the kernel. -These “half-interrupts” mean that RCU has to be very careful -about how it counts interrupt nesting levels. -I learned of this requirement the hard way during a rewrite -of RCU's dyntick-idle code. - -

    -The Linux kernel has non-maskable interrupts (NMIs), and -RCU read-side critical sections are legal within NMI handlers. -Thankfully, RCU update-side primitives, including -call_rcu(), are prohibited within NMI handlers. - -

    -The name notwithstanding, some Linux-kernel architectures -can have nested NMIs, which RCU must handle correctly. -Andy Lutomirski -surprised me -with this requirement; -he also kindly surprised me with -an algorithm -that meets this requirement. - -

    Loadable Modules

    - -

    -The Linux kernel has loadable modules, and these modules can -also be unloaded. -After a given module has been unloaded, any attempt to call -one of its functions results in a segmentation fault. -The module-unload functions must therefore cancel any -delayed calls to loadable-module functions, for example, -any outstanding mod_timer() must be dealt with -via del_timer_sync() or similar. - -

    -Unfortunately, there is no way to cancel an RCU callback; -once you invoke call_rcu(), the callback function is -going to eventually be invoked, unless the system goes down first. -Because it is normally considered socially irresponsible to crash the system -in response to a module unload request, we need some other way -to deal with in-flight RCU callbacks. - -

    -RCU therefore provides -rcu_barrier(), -which waits until all in-flight RCU callbacks have been invoked. -If a module uses call_rcu(), its exit function should therefore -prevent any future invocation of call_rcu(), then invoke -rcu_barrier(). -In theory, the underlying module-unload code could invoke -rcu_barrier() unconditionally, but in practice this would -incur unacceptable latencies. - -
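
-For example, a module's exit function might first prevent new callbacks
-from being posted and only then wait for the in-flight ones (a minimal
-sketch; my_module_exit() and stop_posting_callbacks() are hypothetical):

-
- 1 static void __exit my_module_exit(void)
- 2 {
- 3   stop_posting_callbacks();  /* No new call_rcu() after this point. */
- 4   rcu_barrier();             /* Wait for in-flight RCU callbacks. */
- 5   /* Now safe to free resources used by the callbacks. */
- 6 }
-
-
- -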

    -Nikita Danilov noted this requirement for an analogous filesystem-unmount -situation, and Dipankar Sarma incorporated rcu_barrier() into RCU. -The need for rcu_barrier() for module unloading became -apparent later. - -

    Hotplug CPU

    - -

    -The Linux kernel supports CPU hotplug, which means that CPUs -can come and go. -It is of course illegal to use any RCU API member from an offline CPU. -This requirement was present from day one in DYNIX/ptx, but -on the other hand, the Linux kernel's CPU-hotplug implementation -is “interesting.” - -

    -The Linux-kernel CPU-hotplug implementation has notifiers that -are used to allow the various kernel subsystems (including RCU) -to respond appropriately to a given CPU-hotplug operation. -Most RCU operations may be invoked from CPU-hotplug notifiers, -including even normal synchronous grace-period operations -such as synchronize_rcu(). -However, expedited grace-period operations such as -synchronize_rcu_expedited() are not supported, -due to the fact that current implementations block CPU-hotplug -operations, which could result in deadlock. - -

    -In addition, all-callback-wait operations such as -rcu_barrier() are also not supported, due to the -fact that there are phases of CPU-hotplug operations where -the outgoing CPU's callbacks will not be invoked until after -the CPU-hotplug operation ends, which could also result in deadlock. - -

    Scheduler and RCU

    - -

    -RCU depends on the scheduler, and the scheduler uses RCU to -protect some of its data structures. -This means the scheduler is forbidden from acquiring -the runqueue locks and the priority-inheritance locks -in the middle of an outermost RCU read-side critical section unless either -(1) it releases them before exiting that same -RCU read-side critical section, or -(2) interrupts are disabled across -that entire RCU read-side critical section. -This same prohibition also applies (recursively!) to any lock that is acquired -while holding any lock to which this prohibition applies. -Adhering to this rule prevents preemptible RCU from invoking -rcu_read_unlock_special() while either runqueue or -priority-inheritance locks are held, thus avoiding deadlock. - -

    -Prior to v4.4, it was only necessary to disable preemption across -RCU read-side critical sections that acquired scheduler locks. -In v4.4, expedited grace periods started using IPIs, and these -IPIs could force a rcu_read_unlock() to take the slowpath. -Therefore, this expedited-grace-period change required disabling of -interrupts, not just preemption. - -

-For RCU's part, the preemptible-RCU rcu_read_unlock() -implementation must be written carefully to avoid similar deadlocks. -In particular, rcu_read_unlock() must tolerate an -interrupt where the interrupt handler invokes both -rcu_read_lock() and rcu_read_unlock(). -This possibility requires rcu_read_unlock() to use -negative nesting levels to avoid destructive recursion via -the interrupt handler's use of RCU. - -

    -This pair of mutual scheduler-RCU requirements came as a -complete surprise. - -

-As noted above, RCU makes use of kthreads, and it is necessary to -avoid excessive CPU-time accumulation by these kthreads. -This requirement was no surprise, but RCU's violation of it -when running context-switch-heavy workloads on kernels built with -CONFIG_NO_HZ_FULL=y -did come as a surprise [PDF]. -RCU has made good progress towards meeting this requirement, even -for context-switch-heavy CONFIG_NO_HZ_FULL=y workloads, -but there is room for further improvement. - -

    Tracing and RCU

    - -

    -It is possible to use tracing on RCU code, but tracing itself -uses RCU. -For this reason, rcu_dereference_raw_notrace() -is provided for use by tracing, which avoids the destructive -recursion that could otherwise ensue. -This API is also used by virtualization in some architectures, -where RCU readers execute in environments in which tracing -cannot be used. -The tracing folks both located the requirement and provided the -needed fix, so this surprise requirement was relatively painless. - -

    Energy Efficiency

    - -

    -Interrupting idle CPUs is considered socially unacceptable, -especially by people with battery-powered embedded systems. -RCU therefore conserves energy by detecting which CPUs are -idle, including tracking CPUs that have been interrupted from idle. -This is a large part of the energy-efficiency requirement, -so I learned of this via an irate phone call. - -

-Because RCU avoids interrupting idle CPUs, it is illegal to -execute an RCU read-side critical section on an idle CPU. -(Kernels built with CONFIG_PROVE_RCU=y will splat -if you try it.) -The RCU_NONIDLE() macro and _rcuidle -event tracing are provided to work around this restriction. -In addition, rcu_is_watching() may be used to -test whether or not it is currently legal to run RCU read-side -critical sections on this CPU. -I learned of the need for diagnostics on the one hand -and RCU_NONIDLE() on the other while inspecting -idle-loop code. -Steven Rostedt supplied _rcuidle event tracing, -which is used quite heavily in the idle loop. - -
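
-For example, code running where RCU is not watching might use either
-approach as follows (a sketch; do_event_trace() is a hypothetical
-tracepoint invocation):

-
- 1 /* Temporarily tell RCU to pay attention to this CPU. */
- 2 RCU_NONIDLE(do_event_trace(arg));
- 3
- 4 /* Or check first whether RCU readers are currently legal. */
- 5 if (rcu_is_watching())
- 6   do_event_trace(arg);
-
-
- -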

    -It is similarly socially unacceptable to interrupt an -nohz_full CPU running in userspace. -RCU must therefore track nohz_full userspace -execution. -And in -CONFIG_NO_HZ_FULL_SYSIDLE=y -kernels, RCU must separately track idle CPUs on the one hand and -CPUs that are either idle or executing in userspace on the other. -In both cases, RCU must be able to sample state at two points in -time, and be able to determine whether or not some other CPU spent -any time idle and/or executing in userspace. - -

    -These energy-efficiency requirements have proven quite difficult to -understand and to meet, for example, there have been more than five -clean-sheet rewrites of RCU's energy-efficiency code, the last of -which was finally able to demonstrate -real energy savings running on real hardware [PDF]. -As noted earlier, -I learned of many of these requirements via angry phone calls: -Flaming me on the Linux-kernel mailing list was apparently not -sufficient to fully vent their ire at RCU's energy-efficiency bugs! - -

    Memory Efficiency

    - -

    -Although small-memory non-realtime systems can simply use Tiny RCU, -code size is only one aspect of memory efficiency. -Another aspect is the size of the rcu_head structure -used by call_rcu() and kfree_rcu(). -Although this structure contains nothing more than a pair of pointers, -it does appear in many RCU-protected data structures, including -some that are size critical. -The page structure is a case in point, as evidenced by -the many occurrences of the union keyword within that structure. - -

    -This need for memory efficiency is one reason that RCU uses hand-crafted -singly linked lists to track the rcu_head structures that -are waiting for a grace period to elapse. -It is also the reason why rcu_head structures do not contain -debug information, such as fields tracking the file and line of the -call_rcu() or kfree_rcu() that posted them. -Although this information might appear in debug-only kernel builds at some -point, in the meantime, the ->func field will often provide -the needed debug information. - -

    -However, in some cases, the need for memory efficiency leads to even -more extreme measures. -Returning to the page structure, the rcu_head field -shares storage with a great many other structures that are used at -various points in the corresponding page's lifetime. -In order to correctly resolve certain -race conditions, -the Linux kernel's memory-management subsystem needs a particular bit -to remain zero during all phases of grace-period processing, -and that bit happens to map to the bottom bit of the -rcu_head structure's ->next field. -RCU makes this guarantee as long as call_rcu() -is used to post the callback, as opposed to kfree_rcu() -or some future “lazy” -variant of call_rcu() that might one day be created for -energy-efficiency purposes. - -

    -Performance, Scalability, Response Time, and Reliability

    - -

-Expanding on the -earlier discussion, -RCU is used heavily by hot code paths in performance-critical -portions of the Linux kernel's networking, security, virtualization, -and scheduling code. -RCU must therefore use efficient implementations, especially in its -read-side primitives. -To that end, it would be good if preemptible RCU's implementation -of rcu_read_lock() could be inlined; however, doing -so requires resolving #include issues with the -task_struct structure. - -

    -The Linux kernel supports hardware configurations with up to -4096 CPUs, which means that RCU must be extremely scalable. -Algorithms that involve frequent acquisitions of global locks or -frequent atomic operations on global variables simply cannot be -tolerated within the RCU implementation. -RCU therefore makes heavy use of a combining tree based on the -rcu_node structure. -RCU is required to tolerate all CPUs continuously invoking any -combination of RCU's runtime primitives with minimal per-operation -overhead. -In fact, in many cases, increasing load must decrease the -per-operation overhead, witness the batching optimizations for -synchronize_rcu(), call_rcu(), -synchronize_rcu_expedited(), and rcu_barrier(). -As a general rule, RCU must cheerfully accept whatever the -rest of the Linux kernel decides to throw at it. - -

    -The Linux kernel is used for real-time workloads, especially -in conjunction with the --rt patchset. -The real-time-latency response requirements are such that the -traditional approach of disabling preemption across RCU -read-side critical sections is inappropriate. -Kernels built with CONFIG_PREEMPT=y therefore -use an RCU implementation that allows RCU read-side critical -sections to be preempted. -This requirement made its presence known after users made it -clear that an earlier -real-time patch -did not meet their needs, in conjunction with some -RCU issues -encountered by a very early version of the -rt patchset. - -

    -In addition, RCU must make do with a sub-100-microsecond real-time latency -budget. -In fact, on smaller systems with the -rt patchset, the Linux kernel -provides sub-20-microsecond real-time latencies for the whole kernel, -including RCU. -RCU's scalability and latency must therefore be sufficient for -these sorts of configurations. -To my surprise, the sub-100-microsecond real-time latency budget - -applies to even the largest systems [PDF], -up to and including systems with 4096 CPUs. -This real-time requirement motivated the grace-period kthread, which -also simplified handling of a number of race conditions. - -

    -RCU must avoid degrading real-time response for CPU-bound threads, whether -executing in usermode (which is one use case for -CONFIG_NO_HZ_FULL=y) or in the kernel. -That said, CPU-bound loops in the kernel must execute -cond_resched_rcu_qs() at least once per few tens of milliseconds -in order to avoid receiving an IPI from RCU. - -
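
-A CPU-bound loop in the kernel might therefore be structured as follows
-(a minimal sketch; more_work() and do_one_unit() are hypothetical):

-
- 1 while (more_work()) {
- 2   do_one_unit();          /* A short, bounded chunk of work. */
- 3   cond_resched_rcu_qs();  /* Supply RCU with a quiescent state. */
- 4 }
-
-
- -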

    -Finally, RCU's status as a synchronization primitive means that -any RCU failure can result in arbitrary memory corruption that can be -extremely difficult to debug. -This means that RCU must be extremely reliable, which in -practice also means that RCU must have an aggressive stress-test -suite. -This stress-test suite is called rcutorture. - -

    -Although the need for rcutorture was no surprise, -the current immense popularity of the Linux kernel is posing -interesting—and perhaps unprecedented—validation -challenges. -To see this, keep in mind that there are well over one billion -instances of the Linux kernel running today, given Android -smartphones, Linux-powered televisions, and servers. -This number can be expected to increase sharply with the advent of -the celebrated Internet of Things. - -

    -Suppose that RCU contains a race condition that manifests on average -once per million years of runtime. -This bug will be occurring about three times per day across -the installed base. -RCU could simply hide behind hardware error rates, given that no one -should really expect their smartphone to last for a million years. -However, anyone taking too much comfort from this thought should -consider the fact that in most jurisdictions, a successful multi-year -test of a given mechanism, which might include a Linux kernel, -suffices for a number of types of safety-critical certifications. -In fact, rumor has it that the Linux kernel is already being used -in production for safety-critical applications. -I don't know about you, but I would feel quite bad if a bug in RCU -killed someone. -Which might explain my recent focus on validation and verification. - -

    Other RCU Flavors

    - -

    -One of the more surprising things about RCU is that there are now -no fewer than five flavors, or API families. -In addition, the primary flavor that has been the sole focus up to -this point has two different implementations, non-preemptible and -preemptible. -The other four flavors are listed below, with requirements for each -described in a separate section. - -

      -
    1. Bottom-Half Flavor -
    2. Sched Flavor -
    3. Sleepable RCU -
    4. Tasks RCU -
    5. - Waiting for Multiple Grace Periods -
    - -

    Bottom-Half Flavor

    - -

    -The softirq-disable (AKA “bottom-half”, -hence the “_bh” abbreviations) -flavor of RCU, or RCU-bh, was developed by -Dipankar Sarma to provide a flavor of RCU that could withstand the -network-based denial-of-service attacks researched by Robert -Olsson. -These attacks placed so much networking load on the system -that some of the CPUs never exited softirq execution, -which in turn prevented those CPUs from ever executing a context switch, -which, in the RCU implementation of that time, prevented grace periods -from ever ending. -The result was an out-of-memory condition and a system hang. - -

    -The solution was the creation of RCU-bh, which does -local_bh_disable() -across its read-side critical sections, and which uses the transition -from one type of softirq processing to another as a quiescent state -in addition to context switch, idle, user mode, and offline. -This means that RCU-bh grace periods can complete even when some of -the CPUs execute in softirq indefinitely, thus allowing algorithms -based on RCU-bh to withstand network-based denial-of-service attacks. - -

-Because -rcu_read_lock_bh() and rcu_read_unlock_bh() -disable and re-enable softirq handlers, any attempt to start a softirq -handler during an -RCU-bh read-side critical section will be deferred. -In this case, rcu_read_unlock_bh() -will invoke softirq processing, which can take considerable time. -One can of course argue that this softirq overhead should be associated -with the code following the RCU-bh read-side critical section rather -than rcu_read_unlock_bh(), but the fact -is that most profiling tools cannot be expected to make this sort -of fine distinction. -For example, suppose that a three-millisecond-long RCU-bh read-side -critical section executes during a time of heavy networking load. -There will very likely be an attempt to invoke at least one softirq -handler during that three milliseconds, but any such invocation will -be delayed until the time of the rcu_read_unlock_bh(). -This can of course make it appear at first glance as if -rcu_read_unlock_bh() was executing very slowly. - -

    -The -RCU-bh API -includes -rcu_read_lock_bh(), -rcu_read_unlock_bh(), -rcu_dereference_bh(), -rcu_dereference_bh_check(), -synchronize_rcu_bh(), -synchronize_rcu_bh_expedited(), -call_rcu_bh(), -rcu_barrier_bh(), and -rcu_read_lock_bh_held(). - -
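
-An RCU-bh reader follows the usual pattern (a sketch; gp is a
-hypothetical RCU-bh-protected pointer):

-
- 1 rcu_read_lock_bh();
- 2 p = rcu_dereference_bh(gp);
- 3 if (p)
- 4   do_something_with(p);
- 5 rcu_read_unlock_bh();
-
-
- -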

    Sched Flavor

    - -

-Before preemptible RCU, waiting for an RCU grace period had the -side effect of also waiting for all pre-existing interrupt -and NMI handlers. -However, there are legitimate preemptible-RCU implementations that -do not have this property, given that any point in the code outside -of an RCU read-side critical section can be a quiescent state. -Therefore, RCU-sched was created, which follows &#8220;classic&#8221; -RCU in that an RCU-sched grace period waits for pre-existing -interrupt and NMI handlers. -In kernels built with CONFIG_PREEMPT=n, the RCU and RCU-sched -APIs have identical implementations, while kernels built with -CONFIG_PREEMPT=y provide a separate implementation for each. - -

    -Note well that in CONFIG_PREEMPT=y kernels, -rcu_read_lock_sched() and rcu_read_unlock_sched() -disable and re-enable preemption, respectively. -This means that if there was a preemption attempt during the -RCU-sched read-side critical section, rcu_read_unlock_sched() -will enter the scheduler, with all the latency and overhead entailed. -Just as with rcu_read_unlock_bh(), this can make it look -as if rcu_read_unlock_sched() was executing very slowly. -However, the highest-priority task won't be preempted, so that task -will enjoy low-overhead rcu_read_unlock_sched() invocations. - -

    -The -RCU-sched API -includes -rcu_read_lock_sched(), -rcu_read_unlock_sched(), -rcu_read_lock_sched_notrace(), -rcu_read_unlock_sched_notrace(), -rcu_dereference_sched(), -rcu_dereference_sched_check(), -synchronize_sched(), -synchronize_rcu_sched_expedited(), -call_rcu_sched(), -rcu_barrier_sched(), and -rcu_read_lock_sched_held(). -However, anything that disables preemption also marks an RCU-sched -read-side critical section, including -preempt_disable() and preempt_enable(), -local_irq_save() and local_irq_restore(), -and so on. - -
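
-For example, an updater can wait for all pre-existing preempt-disabled
-regions as follows (a sketch; gp and do_something_with() are
-hypothetical):

-
- 1 /* Reader: any preemption-disabled region is a critical section. */
- 2 preempt_disable();
- 3 p = rcu_dereference_sched(gp);
- 4 do_something_with(p);
- 5 preempt_enable();
- 6
- 7 /* Updater. */
- 8 old = gp;
- 9 rcu_assign_pointer(gp, NULL);
-10 synchronize_sched();  /* Wait for pre-existing readers. */
-11 kfree(old);
-
-
- -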

    Sleepable RCU

    - -

    -For well over a decade, someone saying “I need to block within -an RCU read-side critical section” was a reliable indication -that this someone did not understand RCU. -After all, if you are always blocking in an RCU read-side critical -section, you can probably afford to use a higher-overhead synchronization -mechanism. -However, that changed with the advent of the Linux kernel's notifiers, -whose RCU read-side critical -sections almost never sleep, but sometimes need to. -This resulted in the introduction of -sleepable RCU, -or SRCU. - -

    -SRCU allows different domains to be defined, with each such domain -defined by an instance of an srcu_struct structure. -A pointer to this structure must be passed in to each SRCU function, -for example, synchronize_srcu(&ss), where -ss is the srcu_struct structure. -The key benefit of these domains is that a slow SRCU reader in one -domain does not delay an SRCU grace period in some other domain. -That said, one consequence of these domains is that read-side code -must pass a “cookie” from srcu_read_lock() -to srcu_read_unlock(), for example, as follows: - -

    -
    - 1 int idx;
    - 2
    - 3 idx = srcu_read_lock(&ss);
    - 4 do_something();
    - 5 srcu_read_unlock(&ss, idx);
    -
    -
    - -

    -As noted above, it is legal to block within SRCU read-side critical sections, -however, with great power comes great responsibility. -If you block forever in one of a given domain's SRCU read-side critical -sections, then that domain's grace periods will also be blocked forever. -Of course, one good way to block forever is to deadlock, which can -happen if any operation in a given domain's SRCU read-side critical -section can block waiting, either directly or indirectly, for that domain's -grace period to elapse. -For example, this results in a self-deadlock: - -

    -
    - 1 int idx;
    - 2
    - 3 idx = srcu_read_lock(&ss);
    - 4 do_something();
    - 5 synchronize_srcu(&ss);
    - 6 srcu_read_unlock(&ss, idx);
    -
    -
    - -

-However, if line 5 acquired a mutex that was held across -a synchronize_srcu() for domain ss, -deadlock would still be possible. -Furthermore, if line 5 acquired a mutex that was held across -a synchronize_srcu() for some other domain ss1, -and if an ss1-domain SRCU read-side critical section -acquired another mutex that was held across an ss-domain -synchronize_srcu(), -deadlock would again be possible. -Such a deadlock cycle could extend across an arbitrarily large number -of different SRCU domains. -Again, with great power comes great responsibility. - -

    -Unlike the other RCU flavors, SRCU read-side critical sections can -run on idle and even offline CPUs. -This ability requires that srcu_read_lock() and -srcu_read_unlock() contain memory barriers, which means -that SRCU readers will run a bit slower than would RCU readers. -It also motivates the smp_mb__after_srcu_read_unlock() -API, which, in combination with srcu_read_unlock(), -guarantees a full memory barrier. - -

    -The -SRCU API -includes -srcu_read_lock(), -srcu_read_unlock(), -srcu_dereference(), -srcu_dereference_check(), -synchronize_srcu(), -synchronize_srcu_expedited(), -call_srcu(), -srcu_barrier(), and -srcu_read_lock_held(). -It also includes -DEFINE_SRCU(), -DEFINE_STATIC_SRCU(), and -init_srcu_struct() -APIs for defining and initializing srcu_struct structures. - -
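
-Defining a domain and posting an asynchronous callback might look as
-follows (a sketch; my_srcu, struct my_obj, and my_cb() are hypothetical):

-
- 1 DEFINE_STATIC_SRCU(my_srcu);
- 2
- 3 static void my_cb(struct rcu_head *head)
- 4 {
- 5   kfree(container_of(head, struct my_obj, rh));
- 6 }
- 7
- 8 /* Updater, after removing old from reader visibility: */
- 9 call_srcu(&my_srcu, &old->rh, my_cb);
-
-
- -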

    Tasks RCU

    - -

-Some forms of tracing use &#8220;trampolines&#8221; to handle the -binary rewriting required to install different types of probes. -It would be good to be able to free old trampolines, which sounds -like a job for some form of RCU. -However, because it is necessary to be able to install a trace -anywhere in the code, it is not possible to use read-side markers -such as rcu_read_lock() and rcu_read_unlock(). -In addition, it does not work to have these markers in the trampoline -itself, because there would need to be instructions following -rcu_read_unlock(). -Although synchronize_rcu() would guarantee that execution -reached the rcu_read_unlock(), it would not be able to -guarantee that execution had completely left the trampoline. - -

    -The solution, in the form of -Tasks RCU, -is to have implicit -read-side critical sections that are delimited by voluntary context -switches, that is, calls to schedule(), -cond_resched_rcu_qs(), and -synchronize_rcu_tasks(). -In addition, transitions to and from userspace execution also delimit -tasks-RCU read-side critical sections. - -

    -The tasks-RCU API is quite compact, consisting only of -call_rcu_tasks(), -synchronize_rcu_tasks(), and -rcu_barrier_tasks(). - -
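
-For example, trampoline teardown might proceed as follows (a sketch;
-detach_trampoline() and free_trampoline() are hypothetical):

-
- 1 detach_trampoline(tramp);   /* Remove all entry points into it. */
- 2 synchronize_rcu_tasks();    /* Wait for all tasks to leave it. */
- 3 free_trampoline(tramp);     /* Now safe to free. */
-
-
- -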

    -Waiting for Multiple Grace Periods

    - -

-Perhaps you have an RCU-protected data structure that is accessed from -RCU read-side critical sections, from softirq handlers, and from -hardware interrupt handlers. -That is three flavors of RCU: the normal flavor, the bottom-half flavor, -and the sched flavor. -How can you wait for a compound grace period? - -

    -The best approach is usually to “just say no!” and -insert rcu_read_lock() and rcu_read_unlock() -around each RCU read-side critical section, regardless of what -environment it happens to be in. -But suppose that some of the RCU read-side critical sections are -on extremely hot code paths, and that use of CONFIG_PREEMPT=n -is not a viable option, so that rcu_read_lock() and -rcu_read_unlock() are not free. -What then? - -

    -You could wait on all three grace periods in succession, as follows: - -

    -
    - 1 synchronize_rcu();
    - 2 synchronize_rcu_bh();
    - 3 synchronize_sched();
    -
    -
    - -

    -This works, but triples the update-side latency penalty. -In cases where this is not acceptable, synchronize_rcu_mult() -may be used to wait on all three flavors of grace period concurrently: - -

    -
    - 1 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched);
    -
    -
    - -

    -But what if it is necessary to also wait on SRCU? -This can be done as follows: - -

    -
    - 1 static void call_my_srcu(struct rcu_head *head,
    - 2        void (*func)(struct rcu_head *head))
    - 3 {
    - 4   call_srcu(&my_srcu, head, func);
    - 5 }
    - 6
    - 7 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched, call_my_srcu);
    -
    -
    - -

    -If you needed to wait on multiple different flavors of SRCU -(but why???), you would need to create a wrapper function resembling -call_my_srcu() for each SRCU flavor. - -

    @@QQ@@ -But what if I need to wait for multiple RCU flavors, but I also need -the grace periods to be expedited? -

    @@QQA@@ -If you are using expedited grace periods, there should be less penalty -for waiting on them in succession. -But if that is nevertheless a problem, you can use workqueues or multiple -kthreads to wait on the various expedited grace periods concurrently. -

    @@QQE@@ - -

    -Again, it is usually better to adjust the RCU read-side critical sections -to use a single flavor of RCU, but when this is not feasible, you can use -synchronize_rcu_mult(). - -

    Possible Future Changes

    - -

    -One of the tricks that RCU uses to attain update-side scalability is -to increase grace-period latency with increasing numbers of CPUs. -If this becomes a serious problem, it will be necessary to rework the -grace-period state machine so as to avoid the need for the additional -latency. - -

-Expedited grace periods scan the CPUs, so their latency and overhead -increase with increasing numbers of CPUs. -If this becomes a serious problem on large systems, it will be necessary -to do some redesign to avoid this scalability problem. - -

    -RCU disables CPU hotplug in a few places, perhaps most notably in the -expedited grace-period and rcu_barrier() operations. -If there is a strong reason to use expedited grace periods in CPU-hotplug -notifiers, it will be necessary to avoid disabling CPU hotplug. -This would introduce some complexity, so there had better be a very -good reason. - -

    -The tradeoff between grace-period latency on the one hand and interruptions -of other CPUs on the other hand may need to be re-examined. -The desire is of course for zero grace-period latency as well as zero -interprocessor interrupts undertaken during an expedited grace period -operation. -While this ideal is unlikely to be achievable, it is quite possible that -further improvements can be made. - -

    -The multiprocessor implementations of RCU use a combining tree that -groups CPUs so as to reduce lock contention and increase cache locality. -However, this combining tree does not spread its memory across NUMA -nodes nor does it align the CPU groups with hardware features such -as sockets or cores. -Such spreading and alignment is currently believed to be unnecessary -because the hotpath read-side primitives do not access the combining -tree, nor does call_rcu() in the common case. -If you believe that your architecture needs such spreading and alignment, -then your architecture should also benefit from the -rcutree.rcu_fanout_leaf boot parameter, which can be set -to the number of CPUs in a socket, NUMA node, or whatever. -If the number of CPUs is too large, use a fraction of the number of -CPUs. -If the number of CPUs is a large prime number, well, that certainly -is an “interesting” architectural choice! -More flexible arrangements might be considered, but only if -rcutree.rcu_fanout_leaf has proven inadequate, and only -if the inadequacy has been demonstrated by a carefully run and -realistic system-level workload. - -
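
-For example, on a system built from 16-CPU sockets, one might align the
-leaf rcu_node structures with sockets by booting with the following
-kernel parameter (an illustrative value, not a recommendation):

-
- 1 rcutree.rcu_fanout_leaf=16
-
-
- -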

    -Please note that arrangements that require RCU to remap CPU numbers will -require extremely good demonstration of need and full exploration of -alternatives. - -

    -There is an embarrassingly large number of flavors of RCU, and this -number has been increasing over time. -Perhaps it will be possible to combine some at some future date. - -

    -RCU's various kthreads are reasonably recent additions. -It is quite likely that adjustments will be required to more gracefully -handle extreme loads. -It might also be necessary to be able to relate CPU utilization by -RCU's kthreads and softirq handlers to the code that instigated this -CPU utilization. -For example, RCU callback overhead might be charged back to the -originating call_rcu() instance, though probably not -in production kernels. - -

    Summary

    - -

-This document has presented more than two decades' worth of RCU -requirements. -Given that the requirements keep changing, this will not be the last -word on this subject, but at least it serves to get an important -subset of the requirements set forth. - -

    Acknowledgments

    - -I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, -Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and -Andy Lutomirski for their help in rendering -this article human readable, and to Michelle Rankin for her support -of this effort. -Other contributions are acknowledged in the Linux kernel's git archive. -The cartoon is copyright (c) 2013 by Melissa Broussard, -and is provided -under the terms of the Creative Commons Attribution-Share Alike 3.0 -United States license. - -

    @@QQAL@@ - - diff --git a/Documentation/RCU/Design/htmlqqz.sh b/Documentation/RCU/Design/htmlqqz.sh deleted file mode 100755 index d354f069559b8..0000000000000 --- a/Documentation/RCU/Design/htmlqqz.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/sh -# -# Usage: sh htmlqqz.sh file -# -# Extracts and converts quick quizzes in a proto-HTML document file.htmlx. -# Commands, all of which must be on a line by themselves: -# -# "

    @@QQ@@": Start of a quick quiz. -# "

    @@QQA@@": Start of a quick-quiz answer. -# "

    @@QQE@@": End of a quick-quiz answer, and thus of the quick quiz. -# "

    @@QQAL@@": Place to put quick-quiz answer list. -# -# Places the result in file.html. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, you can access it online at -# http://www.gnu.org/licenses/gpl-2.0.html. -# -# Copyright (c) 2013 Paul E. McKenney, IBM Corporation. - -fn=$1 -if test ! -r $fn.htmlx -then - echo "Error: $fn.htmlx unreadable." - exit 1 -fi - -echo "" > $fn.html -echo "" >> $fn.html -awk < $fn.htmlx >> $fn.html ' - -state == "" && $1 != "

    @@QQ@@" && $1 != "

    @@QQAL@@" { - print $0; - if ($0 ~ /^

    @@QQ/) - print "Bad Quick Quiz command: " NR " (expected

    @@QQ@@ or

    @@QQAL@@)." > "/dev/stderr" - next; -} - -state == "" && $1 == "

    @@QQ@@" { - qqn++; - qqlineno = NR; - haveqq = 1; - state = "qq"; - print "

    Quick Quiz " qqn ":" - next; -} - -state == "qq" && $1 != "

    @@QQA@@" { - qq[qqn] = qq[qqn] $0 "\n"; - print $0 - if ($0 ~ /^

    @@QQ/) - print "Bad Quick Quiz command: " NR ". (expected

    @@QQA@@)" > "/dev/stderr" - next; -} - -state == "qq" && $1 == "

    @@QQA@@" { - state = "qqa"; - print "
    Answer" - next; -} - -state == "qqa" && $1 != "

    @@QQE@@" { - qqa[qqn] = qqa[qqn] $0 "\n"; - if ($0 ~ /^

    @@QQ/) - print "Bad Quick Quiz command: " NR " (expected

    @@QQE@@)." > "/dev/stderr" - next; -} - -state == "qqa" && $1 == "

    @@QQE@@" { - state = ""; - next; -} - -state == "" && $1 == "

    @@QQAL@@" { - haveqq = ""; - print "

    " - print "Answers to Quick Quizzes

    " - print ""; - for (i = 1; i <= qqn; i++) { - print "" - print "

    Quick Quiz " i ":" - print qq[i]; - print ""; - print "

    Answer:" - print qqa[i]; - print ""; - print "

    Back to Quick Quiz " i "." - print ""; - } - next; -} - -END { - if (state != "") - print "Unterminated Quick Quiz: " qqlineno "." > "/dev/stderr" - else if (haveqq) - print "Missing \"

    @@QQAL@@\", no Quick Quiz." > "/dev/stderr" -}' -- GitLab From 5413e24c943da33306047fc091fa34fa4f261b3b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Mar 2016 13:40:28 -0700 Subject: [PATCH 073/705] documentation: Sharpen up the no-readers quick quiz Signed-off-by: Paul E. McKenney --- .../RCU/Design/Requirements/Requirements.html | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index acdad96f78e9c..85cf2238fd083 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1002,18 +1002,21 @@ obligation to wait for these new readers.   Quick Quiz: - Suppose that synchronize_rcu() did wait until all readers had completed. - Would the updater be able to rely on this? + Suppose that synchronize_rcu() did wait until all + readers had completed instead of waiting only on + pre-existing readers. + For how long would the updater be able to rely on there + being no readers? Answer: - No. + For no time at all. Even if synchronize_rcu() were to wait until all readers had completed, a new reader might start immediately after synchronize_rcu() completed. Therefore, the code following - synchronize_rcu() cannot rely on there being no readers - in any case. + synchronize_rcu() can never rely on there being + no readers.   -- GitLab From 0c7d10e4b998b2f751cebf98435f1ec2dd312c87 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 31 Mar 2016 11:00:08 -0700 Subject: [PATCH 074/705] documentation: Emphasize the call_rcu() is illegal from idle Although call_rcu()'s fastpath works just fine on an idle CPU, some branches of the slowpath invoke the scheduler, which uses RCU. Therefore, this commit emphasizes the fact that call_rcu() must not be invoked from an idle CPU. Signed-off-by: Paul E. McKenney --- Documentation/RCU/Design/Requirements/Requirements.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 85cf2238fd083..e7e24b3e86e29 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1650,7 +1650,7 @@ situations where neither synchronize_rcu() nor including within preempt-disable code, local_bh_disable() code, interrupt-disable code, and interrupt handlers. However, even call_rcu() is illegal within NMI handlers -and from offline CPUs. +and from idle and offline CPUs. The callback function (remove_gp_cb() in this case) will be executed within softirq (software interrupt) environment within the Linux kernel, -- GitLab From 28728dd310d48834cd486dac3cac9ae96b9deb96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jan 2016 08:33:37 -0800 Subject: [PATCH 075/705] rcu: Make expedited RCU-sched grace period immediately detect idle Currently, sync_sched_exp_handler() will force a reschedule unless this CPU has already checked in or unless a reschedule has already been called for. This is clearly wasteful if sync_sched_exp_handler() interrupted an idle CPU, so this commit immediately reports the quiescent state in that case. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 531a328076bdd..5f4336fadc288 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3649,6 +3649,11 @@ static void sync_sched_exp_handler(void *data) if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) return; + if (rcu_is_cpu_rrupt_from_idle()) { + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); + return; + } __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); resched_cpu(smp_processor_id()); } -- GitLab From 251c617c75f48e03523c43c4ce1dff44bc3ae2bd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Jan 2016 10:52:35 -0800 Subject: [PATCH 076/705] rcu: Make expedited RCU-preempt stall warnings count accurately Currently, synchronize_sched_expedited_wait() simply sets the ndetected variable to the rcu_print_task_exp_stall() return value. This means that if the last rcu_node structure has no stalled tasks, record of any stalled tasks in previous rcu_node structures is lost, which can in turn result in failure to dump out the blocking rcu_node structures. Or could, had the test been correct. This commit therefore adds the return value of rcu_print_task_exp_stall() to ndetected and corrects the later test for ndetected. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5f4336fadc288..687d8a5f35c7b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3778,7 +3778,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rsp->name); ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { - ndetected = rcu_print_task_exp_stall(rnp); + ndetected += rcu_print_task_exp_stall(rnp); mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { struct rcu_data *rdp; @@ -3797,7 +3797,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rsp->expedited_sequence, rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); - if (!ndetected) { + if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rsp, rnp) { if (rnp == rnp_root) -- GitLab From a1e1224849d9610b50fd1dd7d6f44308a59e46af Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Jan 2016 13:57:54 -0800 Subject: [PATCH 077/705] rcu: Make cond_resched_rcu_qs() supply RCU-sched expedited QS Although cond_resched_rcu_qs() supplies quiescent states to all flavors of normal RCU grace periods, it does nothing for expedited RCU-sched grace periods. This commit therefore adds a check for a need for a quiescent state from the current CPU by an expedited RCU-sched grace period, and invokes rcu_sched_qs() to supply that quiescent state if so. Note that the check is racy in that we might be migrated to some other CPU just after checking the per-CPU variable. This is OK because the act of migration will do a context switch, which will supply the needed quiescent state. The only downside is that we might do an unnecessary call to rcu_sched_qs(), but the probability is low and the overhead is small. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 687d8a5f35c7b..178575c01d092 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -370,6 +370,21 @@ void rcu_all_qs(void) rcu_momentary_dyntick_idle(); local_irq_restore(flags); } + if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { + /* + * Yes, we just checked a per-CPU variable with preemption + * enabled, so we might be migrated to some other CPU at + * this point. That is OK because in that case, the + * migration will supply the needed quiescent state. + * We might end up needlessly disabling preemption and + * invoking rcu_sched_qs() on the destination CPU, but + * the probability and cost are both quite low, so this + * should not be a problem in practice. + */ + preempt_disable(); + rcu_sched_qs(); + preempt_enable(); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } -- GitLab From 26ece8ef6eca97f19eb5ad5186b8c1a29ab25d76 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jan 2016 18:48:37 -0800 Subject: [PATCH 078/705] rcu: Fix synchronize_rcu_expedited() header comment This commit brings the synchronize_rcu_expedited() function's header comment into line with the new implementation. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index efdf7b61ce120..a2ac2628ef8ea 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -722,13 +722,19 @@ static void sync_rcu_exp_handler(void *info) * synchronize_rcu_expedited - Brute-force RCU grace period * * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to invoke synchronize_sched_expedited() to push all the tasks to - * the ->blkd_tasks lists and wait for this list to drain. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. - * In fact, if you are using synchronize_rcu_expedited() in a loop, - * please restructure your code to batch your updates, and then Use a - * single synchronize_rcu() instead. + * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler + * checks whether the CPU is in an RCU-preempt critical section, and + * if so, it sets a flag that causes the outermost rcu_read_unlock() + * to report the quiescent state. On the other hand, if the CPU is + * not in an RCU read-side critical section, the IPI handler reports + * the quiescent state immediately. + * + * Although this is a greate improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, so is + * thus not recommended for any sort of common-case code. In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then Use a single synchronize_rcu() + * instead. */ void synchronize_rcu_expedited(void) { -- GitLab From e087816db9423fdc49302d3cd7ec01e487477a71 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jan 2016 20:25:09 -0800 Subject: [PATCH 079/705] rcu: Add event tracing definitions for expedited grace periods Signed-off-by: Paul E. 
McKenney --- include/trace/events/rcu.h | 78 +++++++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 2 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index ef72c4aada566..aacc172eba7ee 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -171,6 +171,76 @@ TRACE_EVENT(rcu_grace_period_init, __entry->grplo, __entry->grphi, __entry->qsmask) ); +/* + * Tracepoint for expedited grace-period events. Takes a string identifying + * the RCU flavor, the expedited grace-period sequence number, and a string + * identifying the grace-period-related event as follows: + * + * "snap": Captured snapshot of expedited grace period sequence number. + * "start": Started a real expedited grace period. + * "end": Ended a real expedited grace period. + * "done": Someone else did the expedited grace period for us. + */ +TRACE_EVENT(rcu_exp_grace_period, + + TP_PROTO(const char *rcuname, unsigned long gpseq, const char *gpevent), + + TP_ARGS(rcuname, gpseq, gpevent), + + TP_STRUCT__entry( + __field(const char *, rcuname) + __field(unsigned long, gpseq) + __field(const char *, gpevent) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->gpseq = gpseq; + __entry->gpevent = gpevent; + ), + + TP_printk("%s %lu %s", + __entry->rcuname, __entry->gpseq, __entry->gpevent) +); + +/* + * Tracepoint for expedited grace-period funnel-locking events. Takes a + * string identifying the RCU flavor, an integer identifying the rcu_node + * combining-tree level, another pair of integers identifying the lowest- + * and highest-numbered CPU associated with the current rcu_node structure, + * and a string. identifying the grace-period-related event as follows: + * + * "acq": Acquired a level of funnel lock + * "rel": Released a level of funnel lock + */ +TRACE_EVENT(rcu_exp_funnel_lock, + + TP_PROTO(const char *rcuname, u8 level, int grplo, int grphi, + const char *gpevent), + + TP_ARGS(rcuname, level, grplo, grphi, gpevent), + + TP_STRUCT__entry( + __field(const char *, rcuname) + __field(u8, level) + __field(int, grplo) + __field(int, grphi) + __field(const char *, gpevent) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->level = level; + __entry->grplo = grplo; + __entry->grphi = grphi; + __entry->gpevent = gpevent; + ), + + TP_printk("%s %d %d %d %s", + __entry->rcuname, __entry->level, __entry->grplo, + __entry->grphi, __entry->gpevent) +); + /* * Tracepoint for RCU no-CBs CPU callback handoffs. This event is intended * to assist debugging of these handoffs. @@ -704,11 +774,15 @@ TRACE_EVENT(rcu_barrier, #else /* #ifdef CONFIG_RCU_TRACE */ #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) -#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ - qsmask) do { } while (0) #define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \ level, grplo, grphi, event) \ do { } while (0) +#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ + qsmask) do { } while (0) +#define trace_rcu_exp_grace_period(rcuname, gqseq, gpevent) \ + do { } while (0) +#define trace_rcu_exp_funnel_lock(rcuname, level, grplo, grphi, gpevent) \ + do { } while (0) #define trace_rcu_nocb_wake(rcuname, cpu, reason) do { } while (0) #define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0) #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) -- GitLab From bea2de44ae647698dc848a671fdee6e53c192423 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 28 Jan 2016 20:30:06 -0800 Subject: [PATCH 080/705] rcu: Add funnel-locking tracing for expedited grace periods Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 31 +++++++++++++++++++++++++++---- kernel/rcu/tree_plugin.h | 3 +++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 178575c01d092..79e9206a7b11c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3584,10 +3584,18 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, atomic_long_t *stat, unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { - if (rnp) + if (rnp) { mutex_unlock(&rnp->exp_funnel_mutex); - else if (rdp) + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, + TPS("rel")); + } else if (rdp) { mutex_unlock(&rdp->exp_funnel_mutex); + trace_rcu_exp_funnel_lock(rsp->name, + rdp->mynode->level + 1, + rdp->cpu, rdp->cpu, + TPS("rel")); + } /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(stat); @@ -3619,6 +3627,9 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) if (sync_exp_work_done(rsp, rnp0, NULL, &rdp->expedited_workdone0, s)) return NULL; + trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, + rnp0->grplo, rnp0->grphi, + TPS("acq")); return rnp0; } } @@ -3634,16 +3645,28 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) return NULL; mutex_lock(&rdp->exp_funnel_mutex); + trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1, + rdp->cpu, rdp->cpu, TPS("acq")); rnp0 = rdp->mynode; for (; rnp0 != NULL; rnp0 = rnp0->parent) { if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->expedited_workdone2, s)) return NULL; mutex_lock(&rnp0->exp_funnel_mutex); - if (rnp1) + trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, + rnp0->grplo, rnp0->grphi, TPS("acq")); + if (rnp1) { mutex_unlock(&rnp1->exp_funnel_mutex); - else + trace_rcu_exp_funnel_lock(rsp->name, rnp1->level, + rnp1->grplo, rnp1->grphi, + TPS("rel")); + } else { mutex_unlock(&rdp->exp_funnel_mutex); + trace_rcu_exp_funnel_lock(rsp->name, + rdp->mynode->level + 1, + rdp->cpu, rdp->cpu, + TPS("rel")); + } rnp1 = rnp0; } if (sync_exp_work_done(rsp, rnp1, rdp, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a2ac2628ef8ea..cd2dae43ff48f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -767,6 +767,9 @@ void synchronize_rcu_expedited(void) /* Clean up and exit. */ rcu_exp_gp_seq_end(rsp); mutex_unlock(&rnp_unlock->exp_funnel_mutex); + trace_rcu_exp_funnel_lock(rsp->name, rnp_unlock->level, + rnp_unlock->grplo, rnp_unlock->grphi, + TPS("rel")); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -- GitLab From 4f41530245c7fd4837152e264d120d05ae940eb0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jan 2016 20:49:49 -0800 Subject: [PATCH 081/705] rcu: Add expedited-grace-period event tracing Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 20 +++++++++++++------- kernel/rcu/tree_plugin.h | 3 +++ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 79e9206a7b11c..524026fd9dd7f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3584,17 +3584,18 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, atomic_long_t *stat, unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { + trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); if (rnp) { - mutex_unlock(&rnp->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, rnp->grphi, TPS("rel")); + mutex_unlock(&rnp->exp_funnel_mutex); } else if (rdp) { - mutex_unlock(&rdp->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1, rdp->cpu, rdp->cpu, TPS("rel")); + mutex_unlock(&rdp->exp_funnel_mutex); } /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ @@ -3624,12 +3625,12 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) rnp0 = rcu_get_root(rsp); if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { if (mutex_trylock(&rnp0->exp_funnel_mutex)) { - if (sync_exp_work_done(rsp, rnp0, NULL, - &rdp->expedited_workdone0, s)) - return NULL; trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, rnp0->grplo, rnp0->grphi, TPS("acq")); + if (sync_exp_work_done(rsp, rnp0, NULL, + &rdp->expedited_workdone0, s)) + return NULL; return rnp0; } } @@ -3656,16 +3657,16 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, rnp0->grplo, rnp0->grphi, TPS("acq")); if (rnp1) { - mutex_unlock(&rnp1->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rnp1->level, rnp1->grplo, rnp1->grphi, TPS("rel")); + mutex_unlock(&rnp1->exp_funnel_mutex); } else { - mutex_unlock(&rdp->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1, rdp->cpu, rdp->cpu, TPS("rel")); + mutex_unlock(&rdp->exp_funnel_mutex); } rnp1 = rnp0; } @@ -3895,16 +3896,21 @@ void synchronize_sched_expedited(void) /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); rnp = exp_funnel_lock(rsp, s); if (rnp == NULL) return; /* Someone else did our work for us. */ rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, TPS("rel")); mutex_unlock(&rnp->exp_funnel_mutex); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index cd2dae43ff48f..36e94aed38a7c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -750,12 +750,14 @@ void synchronize_rcu_expedited(void) } s = rcu_exp_gp_seq_snap(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); rnp_unlock = exp_funnel_lock(rsp, s); if (rnp_unlock == NULL) return; /* Someone else did our work for us. */ rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); /* Initialize the rcu_node tree in preparation for the wait. */ sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); @@ -766,6 +768,7 @@ void synchronize_rcu_expedited(void) /* Clean up and exit. 
*/ rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); mutex_unlock(&rnp_unlock->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rnp_unlock->level, rnp_unlock->grplo, rnp_unlock->grphi, -- GitLab From e2fd9d35847d1936398d44c4df68dceb3d7f64e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 30 Jan 2016 17:23:19 -0800 Subject: [PATCH 082/705] rcu: Remove expedited GP funnel-lock bypass Commit #cdacbe1f91264 ("rcu: Add fastpath bypassing funnel locking") turns out to be a pessimization at high load because it forces a tree full of tasks to wait for an expedited grace period that they probably do not need. This commit therefore removes this optimization. Signed-off-by: Paul E. McKenney --- Documentation/RCU/trace.txt | 10 +++++----- kernel/rcu/tree.c | 19 ------------------- kernel/rcu/tree.h | 1 - kernel/rcu/tree_trace.c | 7 +++---- 4 files changed, 8 insertions(+), 29 deletions(-) diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index ec6998b1b6d04..00a3a38b375ae 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -237,17 +237,17 @@ o "ktl" is the low-order 16 bits (in hexadecimal) of the count of The output of "cat rcu/rcu_preempt/rcuexp" looks as follows: -s=21872 wd0=0 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872 +s=21872 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872 These fields are as follows: o "s" is the sequence number, with an odd number indicating that an expedited grace period is in progress. -o "wd0", "wd1", "wd2", and "wd3" are the number of times that an - attempt to start an expedited grace period found that someone - else had completed an expedited grace period that satisfies the - attempted request. "Our work is done." +o "wd1", "wd2", and "wd3" are the number of times that an attempt + to start an expedited grace period found that someone else had + completed an expedited grace period that satisfies the attempted + request. "Our work is done." o "n" is number of times that a concurrent CPU-hotplug operation forced a fallback to a normal grace period. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 524026fd9dd7f..62e73e0a929f9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3616,25 +3616,6 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) struct rcu_node *rnp0; struct rcu_node *rnp1 = NULL; - /* - * First try directly acquiring the root lock in order to reduce - * latency in the common case where expedited grace periods are - * rare. We check mutex_is_locked() to avoid pathological levels of - * memory contention on ->exp_funnel_mutex in the heavy-load case. - */ - rnp0 = rcu_get_root(rsp); - if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { - if (mutex_trylock(&rnp0->exp_funnel_mutex)) { - trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, - rnp0->grplo, rnp0->grphi, - TPS("acq")); - if (sync_exp_work_done(rsp, rnp0, NULL, - &rdp->expedited_workdone0, s)) - return NULL; - return rnp0; - } - } - /* * Each pass through the following loop works its way * up the rcu_node tree, returning if others have done the diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index df668c0f9e649..ac9a7b0c36aea 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -388,7 +388,6 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ struct mutex exp_funnel_mutex; - atomic_long_t expedited_workdone0; /* # done by others #0. */ atomic_long_t expedited_workdone1; /* # done by others #1. 
*/ atomic_long_t expedited_workdone2; /* # done by others #2. */ atomic_long_t expedited_workdone3; /* # done by others #3. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 1088e64f01ad8..d149c412a4e51 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,17 +185,16 @@ static int show_rcuexp(struct seq_file *m, void *v) int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; struct rcu_data *rdp; - unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; + unsigned long s1 = 0, s2 = 0, s3 = 0; for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(rsp->rda, cpu); - s0 += atomic_long_read(&rdp->expedited_workdone0); s1 += atomic_long_read(&rdp->expedited_workdone1); s2 += atomic_long_read(&rdp->expedited_workdone2); s3 += atomic_long_read(&rdp->expedited_workdone3); } - seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, s0, s1, s2, s3, + seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + rsp->expedited_sequence, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); -- GitLab From ec3833ed02ae6ef2a933ece9de7cbab0c64c699e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Jan 2016 16:29:29 -0800 Subject: [PATCH 083/705] rcu: Force boolean subscript for expedited stall warnings The cpu_online() function can return values other than 0 and 1, which can result in subscript overflow when applied to a two-element array. This commit allows for this behavior by using "!!" on the return value from cpu_online() when used as a subscript. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 62e73e0a929f9..64c2e32885513 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3808,7 +3808,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) ndetected++; rdp = per_cpu_ptr(rsp->rda, cpu); pr_cont(" %d-%c%c%c", cpu, - "O."[cpu_online(cpu)], + "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rnp->expmaskinit)], "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); } -- GitLab From d40a4f09a448382961fa9b1a2f7d4f34813f0273 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 Mar 2016 14:43:44 -0800 Subject: [PATCH 084/705] rcu: Shorten expedited_workdone* to exp_workdone* Just a name change to save a few lines and a bit of typing. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 +++----- kernel/rcu/tree.h | 6 +++--- kernel/rcu/tree_trace.c | 6 +++--- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 64c2e32885513..89f0287677657 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3624,15 +3624,14 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) * can be inexact, as it is just promoting locality and is not * strictly needed for correctness. 
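
The loop being renamed around here is the funnel itself: each task serializes first against its own leaf's subtree, so at most a subtree's worth of tasks ever contend for any one mutex, and only funnel winners climb toward the root. A rough user-space rendering of that hand-over-hand walk (a sketch under simplifying assumptions: pthreads in place of kernel mutexes, and the early bail-outs for already-completed grace periods are omitted):

	#include <pthread.h>
	#include <stddef.h>

	struct node {
		pthread_mutex_t lock;
		struct node *parent;	/* NULL at the root */
	};

	/* Returns with the root's lock held and all lower locks released. */
	static struct node *funnel_lock(struct node *leaf)
	{
		struct node *np, *held = NULL;

		for (np = leaf; np; np = np->parent) {
			pthread_mutex_lock(&np->lock);	/* contend locally first */
			if (held)
				pthread_mutex_unlock(&held->lock);
			held = np;
		}
		return held;
	}

	int main(void)
	{
		struct node root = { PTHREAD_MUTEX_INITIALIZER, NULL };
		struct node leaf = { PTHREAD_MUTEX_INITIALIZER, &root };

		pthread_mutex_unlock(&funnel_lock(&leaf)->lock);
		return 0;
	}
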
*/ - if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) + if (sync_exp_work_done(rsp, NULL, NULL, &rdp->exp_workdone1, s)) return NULL; mutex_lock(&rdp->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1, rdp->cpu, rdp->cpu, TPS("acq")); rnp0 = rdp->mynode; for (; rnp0 != NULL; rnp0 = rnp0->parent) { - if (sync_exp_work_done(rsp, rnp1, rdp, - &rdp->expedited_workdone2, s)) + if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->exp_workdone2, s)) return NULL; mutex_lock(&rnp0->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, @@ -3651,8 +3650,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) } rnp1 = rnp0; } - if (sync_exp_work_done(rsp, rnp1, rdp, - &rdp->expedited_workdone3, s)) + if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->exp_workdone3, s)) return NULL; return rnp1; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ac9a7b0c36aea..6a8f094469249 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -388,9 +388,9 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ struct mutex exp_funnel_mutex; - atomic_long_t expedited_workdone1; /* # done by others #1. */ - atomic_long_t expedited_workdone2; /* # done by others #2. */ - atomic_long_t expedited_workdone3; /* # done by others #3. */ + atomic_long_t exp_workdone1; /* # done by others #1. */ + atomic_long_t exp_workdone2; /* # done by others #2. */ + atomic_long_t exp_workdone3; /* # done by others #3. */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index d149c412a4e51..86782f9a46043 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -189,9 +189,9 @@ static int show_rcuexp(struct seq_file *m, void *v) for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(rsp->rda, cpu); - s1 += atomic_long_read(&rdp->expedited_workdone1); - s2 += atomic_long_read(&rdp->expedited_workdone2); - s3 += atomic_long_read(&rdp->expedited_workdone3); + s1 += atomic_long_read(&rdp->exp_workdone1); + s2 += atomic_long_read(&rdp->exp_workdone2); + s3 += atomic_long_read(&rdp->exp_workdone3); } seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", rsp->expedited_sequence, s1, s2, s3, -- GitLab From f6a12f34a448cc8a624070fd365c29c890138a48 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 30 Jan 2016 17:57:35 -0800 Subject: [PATCH 085/705] rcu: Enforce expedited-GP fairness via funnel wait queue The current mutex-based funnel-locking approach used by expedited grace periods is subject to severe unfairness. The problem arises when a few tasks, making a path from leaves to root, all wake up before other tasks do. A new task can then follow this path all the way to the root, which needlessly delays tasks whose grace period is done, but who do not happen to acquire the lock quickly enough. This commit avoids this problem by maintaining per-rcu_node wait queues, along with a per-rcu_node counter that tracks the latest grace period sought by an earlier task to visit this node. If that grace period would satisfy the current task, instead of proceeding up the tree, it waits on the current rcu_node structure using a pair of wait queues provided for that purpose. This decouples awakening of old tasks from the arrival of new tasks. If the wakeups prove to be a bottleneck, additional kthreads can be brought to bear for that purpose. Signed-off-by: Paul E. 
McKenney --- include/trace/events/rcu.h | 5 +- kernel/rcu/tree.c | 155 +++++++++++++++++++------------------ kernel/rcu/tree.h | 10 +-- kernel/rcu/tree_plugin.h | 16 ++-- 4 files changed, 93 insertions(+), 93 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index aacc172eba7ee..d3e756539d44c 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -179,6 +179,7 @@ TRACE_EVENT(rcu_grace_period_init, * "snap": Captured snapshot of expedited grace period sequence number. * "start": Started a real expedited grace period. * "end": Ended a real expedited grace period. + * "endwake": Woke piggybackers up. * "done": Someone else did the expedited grace period for us. */ TRACE_EVENT(rcu_exp_grace_period, @@ -210,8 +211,8 @@ TRACE_EVENT(rcu_exp_grace_period, * and highest-numbered CPU associated with the current rcu_node structure, * and a string. identifying the grace-period-related event as follows: * - * "acq": Acquired a level of funnel lock - * "rel": Released a level of funnel lock + * "nxtlvl": Advance to next level of rcu_node funnel + * "wait": Wait for someone else to do expedited GP */ TRACE_EVENT(rcu_exp_funnel_lock, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 89f0287677657..bd2658edce006 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -102,6 +102,7 @@ struct rcu_state sname##_state = { \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ + .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ } RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); @@ -3484,7 +3485,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) * for the current expedited grace period. Works only for preemptible * RCU -- other RCU implementation use other means. * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Caller must hold the rcu_state's exp_mutex. */ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) { @@ -3500,8 +3501,8 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold the root rcu_node's exp_funnel_mutex and the - * specified rcu_node structure's ->lock. + * Caller must hold the rcu_state's exp_mutex and the specified rcu_node + * structure's ->lock. */ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake, unsigned long flags) @@ -3538,7 +3539,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, * Report expedited quiescent state for specified node. This is a * lock-acquisition wrapper function for __rcu_report_exp_rnp(). * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Caller must hold the rcu_state's exp_mutex. */ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake) @@ -3551,8 +3552,8 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, /* * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the root - * rcu_node's exp_funnel_mutex. + * specified leaf rcu_node structure. Caller must hold the rcu_state's + * exp_mutex. 
*/ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long mask, bool wake) @@ -3570,7 +3571,6 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, /* * Report expedited quiescent state for specified rcu_data (CPU). - * Caller must hold the root rcu_node's exp_funnel_mutex. */ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, bool wake) @@ -3579,24 +3579,11 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, } /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp, - atomic_long_t *stat, unsigned long s) +static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, + unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); - if (rnp) { - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, - rnp->grplo, rnp->grphi, - TPS("rel")); - mutex_unlock(&rnp->exp_funnel_mutex); - } else if (rdp) { - trace_rcu_exp_funnel_lock(rsp->name, - rdp->mynode->level + 1, - rdp->cpu, rdp->cpu, - TPS("rel")); - mutex_unlock(&rdp->exp_funnel_mutex); - } /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(stat); @@ -3606,53 +3593,53 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, } /* - * Funnel-lock acquisition for expedited grace periods. Returns a - * pointer to the root rcu_node structure, or NULL if some other - * task did the expedited grace period for us. + * Funnel-lock acquisition for expedited grace periods. Returns true + * if some other task completed an expedited grace period that this task + * can piggy-back on, and with no mutex held. Otherwise, returns false + * with the mutex held, indicating that the caller must actually do the + * expedited grace period. */ -static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - struct rcu_node *rnp0; - struct rcu_node *rnp1 = NULL; + struct rcu_node *rnp = rdp->mynode; /* - * Each pass through the following loop works its way - * up the rcu_node tree, returning if others have done the - * work or otherwise falls through holding the root rnp's - * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure - * can be inexact, as it is just promoting locality and is not - * strictly needed for correctness. + * Each pass through the following loop works its way up + * the rcu_node tree, returning if others have done the work or + * otherwise falls through to acquire rsp->exp_mutex. The mapping + * from CPU to rcu_node structure can be inexact, as it is just + * promoting locality and is not strictly needed for correctness. 
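
The decision at each level of the new loop that follows is the interesting part: climb only if this task is the first to request sequence s at the node, otherwise park on the node's waitqueue and let the grace period's wake phase deliver the result. The same decision modeled with a condition variable (an illustrative sketch, not the kernel code; one wait channel per node, no wrap-safe comparisons, and gp_done() stands in for sync_exp_work_done()):

	#include <pthread.h>
	#include <stdbool.h>

	struct xnode {
		pthread_mutex_t lock;	/* models rnp->exp_lock */
		pthread_cond_t wq;	/* models rnp->exp_wq[] */
		unsigned long seq_rq;	/* models rnp->exp_seq_rq */
		struct xnode *parent;
	};

	static unsigned long done_seq;	/* latest completed sequence */

	static bool gp_done(unsigned long s)
	{
		return done_seq >= s;	/* kernel adds barriers, wrap safety */
	}

	/* True: piggy-backed on someone else's GP.  False: caller runs it. */
	static bool xfunnel(struct xnode *np, unsigned long s)
	{
		for (; np; np = np->parent) {
			if (gp_done(s))
				return true;
			pthread_mutex_lock(&np->lock);
			if (np->seq_rq >= s) {	/* an earlier task got here first */
				while (!gp_done(s))
					pthread_cond_wait(&np->wq, &np->lock);
				pthread_mutex_unlock(&np->lock);
				return true;
			}
			np->seq_rq = s;		/* later arrivals may wait on us */
			pthread_mutex_unlock(&np->lock);
		}
		return false;	/* reached the top; caller takes exp_mutex */
	}
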
*/ - if (sync_exp_work_done(rsp, NULL, NULL, &rdp->exp_workdone1, s)) - return NULL; - mutex_lock(&rdp->exp_funnel_mutex); - trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1, - rdp->cpu, rdp->cpu, TPS("acq")); - rnp0 = rdp->mynode; - for (; rnp0 != NULL; rnp0 = rnp0->parent) { - if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->exp_workdone2, s)) - return NULL; - mutex_lock(&rnp0->exp_funnel_mutex); - trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, - rnp0->grplo, rnp0->grphi, TPS("acq")); - if (rnp1) { - trace_rcu_exp_funnel_lock(rsp->name, rnp1->level, - rnp1->grplo, rnp1->grphi, - TPS("rel")); - mutex_unlock(&rnp1->exp_funnel_mutex); - } else { - trace_rcu_exp_funnel_lock(rsp->name, - rdp->mynode->level + 1, - rdp->cpu, rdp->cpu, - TPS("rel")); - mutex_unlock(&rdp->exp_funnel_mutex); + for (; rnp != NULL; rnp = rnp->parent) { + if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) + return true; + + /* Work not done, either wait here or go up. */ + spin_lock(&rnp->exp_lock); + if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { + + /* Someone else doing GP, so wait for them. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, + TPS("wait")); + wait_event(rnp->exp_wq[(s >> 1) & 0x1], + sync_exp_work_done(rsp, + &rdp->exp_workdone2, s)); + return true; } - rnp1 = rnp0; + rnp->exp_seq_rq = s; /* Followers can wait on us. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, + rnp->grphi, TPS("nxtlvl")); } - if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->exp_workdone3, s)) - return NULL; - return rnp1; + mutex_lock(&rsp->exp_mutex); + if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { + mutex_unlock(&rsp->exp_mutex); + return true; + } + return false; } /* Invoked on each online non-idle CPU for expedited quiescent state. */ @@ -3841,6 +3828,27 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } } +/* + * Wake up everyone who piggybacked on the just-completed expedited + * grace period. Also update all the ->exp_seq_rq counters as needed + * in order to avoid counter-wrap problems. + */ +static void rcu_exp_wake(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_node *rnp; + + rcu_for_each_node_breadth_first(rsp, rnp) { + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { + spin_lock(&rnp->exp_lock); + /* Recheck, avoid hang in case someone just arrived. */ + if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) + rnp->exp_seq_rq = s; + spin_unlock(&rnp->exp_lock); + } + wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x1]); + } +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -3860,7 +3868,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) void synchronize_sched_expedited(void) { unsigned long s; - struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; /* If only one CPU, this is automatically a grace period. */ @@ -3877,20 +3884,23 @@ void synchronize_sched_expedited(void) s = rcu_exp_gp_seq_snap(rsp); trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); - rnp = exp_funnel_lock(rsp, s); - if (rnp == NULL) + if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ rcu_exp_gp_seq_start(rsp); trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); + + /* Initialize the rcu_node tree in preparation for the wait. */ sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - synchronize_sched_expedited_wait(rsp); + /* Wait and clean up, including waking everyone. 
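
The wake phase in rcu_exp_wake() above carries a matching obligation: before broadcasting, raise every node's recorded request number to s, so that a task racing in just after the wakeup cannot park itself on a queue that will never again be signaled for that sequence. Continuing the condition-variable sketch from the previous aside (same invented structures, same caveats):

	/* Call once the grace period for sequence s has fully completed. */
	static void xwake(struct xnode *nodes[], int n, unsigned long s)
	{
		int i;

		done_seq = s;	/* kernel: rcu_exp_gp_seq_end() before waking */
		for (i = 0; i < n; i++) {
			pthread_mutex_lock(&nodes[i]->lock);
			if (nodes[i]->seq_rq < s)
				nodes[i]->seq_rq = s;	/* no stranded latecomers */
			pthread_cond_broadcast(&nodes[i]->wq);
			pthread_mutex_unlock(&nodes[i]->lock);
		}
	}
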
*/ + synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, - rnp->grplo, rnp->grphi, TPS("rel")); - mutex_unlock(&rnp->exp_funnel_mutex); + rcu_exp_wake(rsp, s); + + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_mutex); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); @@ -4190,7 +4200,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; - mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -4448,10 +4457,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; - static const char * const exp[] = RCU_EXP_NAME_INIT; static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; - static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -4510,9 +4517,9 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); - mutex_init(&rnp->exp_funnel_mutex); - lockdep_set_class_and_name(&rnp->exp_funnel_mutex, - &rcu_exp_class[i], exp[i]); + init_waitqueue_head(&rnp->exp_wq[0]); + init_waitqueue_head(&rnp->exp_wq[1]); + spin_lock_init(&rnp->exp_lock); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6a8f094469249..f9d4fbb1e014e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -70,7 +70,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } # define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 @@ -79,7 +78,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 @@ -89,7 +87,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 @@ -100,7 +97,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ @@ -252,7 +248,9 @@ struct rcu_node { /* Counts of upcoming no-CB GP requests. 
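
The ULONG_CMP_LT() tests on ->exp_seq_rq above exist because the sequence counter is free-running and will eventually wrap, at which point a plain "<" orders values backwards. The kernel macro (as defined in include/linux/rcupdate.h in this era) leans on unsigned subtraction, demonstrated here in isolation:

	#include <limits.h>
	#include <stdio.h>

	/* Wrap-safe "a is before b" for free-running counters. */
	#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (unsigned long)((a) - (b)))

	int main(void)
	{
		unsigned long a = ULONG_MAX - 1;	/* just before wrap */
		unsigned long b = 2;			/* just after wrap */

		printf("naive a<b: %d\n", (int)(a < b));	/* 0: wrong */
		printf("wrap-safe: %d\n", ULONG_CMP_LT(a, b));	/* 1: right */
		return 0;
	}
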
*/ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; - struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp; + spinlock_t exp_lock ____cacheline_internodealigned_in_smp; + unsigned long exp_seq_rq; + wait_queue_head_t exp_wq[2]; } ____cacheline_internodealigned_in_smp; /* @@ -387,7 +385,6 @@ struct rcu_data { #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - struct mutex exp_funnel_mutex; atomic_long_t exp_workdone1; /* # done by others #1. */ atomic_long_t exp_workdone2; /* # done by others #2. */ atomic_long_t exp_workdone3; /* # done by others #3. */ @@ -504,6 +501,7 @@ struct rcu_state { /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ + struct mutex exp_mutex; /* Serialize expedited GP. */ unsigned long expedited_sequence; /* Take a ticket. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 36e94aed38a7c..c82c3640493f3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -738,8 +738,6 @@ static void sync_rcu_exp_handler(void *info) */ void synchronize_rcu_expedited(void) { - struct rcu_node *rnp; - struct rcu_node *rnp_unlock; struct rcu_state *rsp = rcu_state_p; unsigned long s; @@ -752,8 +750,7 @@ void synchronize_rcu_expedited(void) s = rcu_exp_gp_seq_snap(rsp); trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); - rnp_unlock = exp_funnel_lock(rsp, s); - if (rnp_unlock == NULL) + if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ rcu_exp_gp_seq_start(rsp); @@ -763,16 +760,13 @@ void synchronize_rcu_expedited(void) sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); /* Wait for snapshotted ->blkd_tasks lists to drain. */ - rnp = rcu_get_root(rsp); synchronize_sched_expedited_wait(rsp); - - /* Clean up and exit. */ rcu_exp_gp_seq_end(rsp); trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); - mutex_unlock(&rnp_unlock->exp_funnel_mutex); - trace_rcu_exp_funnel_lock(rsp->name, rnp_unlock->level, - rnp_unlock->grplo, rnp_unlock->grphi, - TPS("rel")); + rcu_exp_wake(rsp, s); + + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_mutex); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -- GitLab From 356051e1de3cf65575da4ee92d1f5cee86677ee2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Mar 2016 13:22:53 -0700 Subject: [PATCH 086/705] rcu: Add exp_funnel_lock() fastpath This commit speeds up the low-contention case, especially for systems with large rcu_node trees, by attempting to directly acquire the ->exp_mutex. This fastpath checks the leaves and root first in order to avoid excessive memory contention on the mutex itself. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bd2658edce006..892a140ae7b6d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3603,6 +3603,15 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); struct rcu_node *rnp = rdp->mynode; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + /* Low-contention fastpath. 
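
The fastpath that follows is a double-checked trylock: cheap read-only tests rule out the cases where the funnel walk would be needed anyway, and only then is the atomic trylock attempted, which keeps a stampede of failed lock operations off the mutex cache line. A condensed user-space rendering (a sketch only; pthreads has no cheap equivalent of mutex_is_locked(), so this version goes straight from the sequence checks to the trylock, and wrap-safe comparison is again omitted):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	static pthread_mutex_t exp_mutex = PTHREAD_MUTEX_INITIALIZER;
	static atomic_ulong leaf_seq_rq, root_seq_rq;	/* model ->exp_seq_rq */

	/* True: exp_mutex acquired without walking the funnel. */
	static bool exp_trylock_fastpath(unsigned long s)
	{
		if (atomic_load(&leaf_seq_rq) >= s)	/* request pending here */
			return false;
		if (atomic_load(&root_seq_rq) >= s)	/* ...or at the root */
			return false;
		return pthread_mutex_trylock(&exp_mutex) == 0;
	}
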
*/ + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && + (rnp == rnp_root || + ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && + !mutex_is_locked(&rsp->exp_mutex) && + mutex_trylock(&rsp->exp_mutex)) + goto fastpath; /* * Each pass through the following loop works its way up @@ -3635,6 +3644,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) rnp->grphi, TPS("nxtlvl")); } mutex_lock(&rsp->exp_mutex); +fastpath: if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { mutex_unlock(&rsp->exp_mutex); return true; -- GitLab From 4ea3e85b113ab37a2d55cfabf0d709ddec088bb3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Mar 2016 16:22:25 -0700 Subject: [PATCH 087/705] rcu: Consolidate expedited GP code into rcu_exp_wait_wake() Currently, synchronize_rcu_expedited() and rcu_sched_expedited() have significant duplicate code. This commit therefore consolidates some of this code into rcu_exp_wake(), which is now renamed to rcu_exp_wait_wake() in recognition of its added responsibilities. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 18 +++++++++--------- kernel/rcu/tree_plugin.h | 10 ++-------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 892a140ae7b6d..fd86eca9478e7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3839,14 +3839,18 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } /* - * Wake up everyone who piggybacked on the just-completed expedited + * Wait for the current expedited grace period to complete, and then + * wake up everyone who piggybacked on the just-completed expedited * grace period. Also update all the ->exp_seq_rq counters as needed * in order to avoid counter-wrap problems. */ -static void rcu_exp_wake(struct rcu_state *rsp, unsigned long s) +static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) { struct rcu_node *rnp; + synchronize_sched_expedited_wait(rsp); + rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); rcu_for_each_node_breadth_first(rsp, rnp) { if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { spin_lock(&rnp->exp_lock); @@ -3857,6 +3861,8 @@ static void rcu_exp_wake(struct rcu_state *rsp, unsigned long s) } wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x1]); } + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_mutex); } /** @@ -3904,13 +3910,7 @@ void synchronize_sched_expedited(void) sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); /* Wait and clean up, including waking everyone. */ - synchronize_sched_expedited_wait(rsp); - rcu_exp_gp_seq_end(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); - rcu_exp_wake(rsp, s); - - trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); - mutex_unlock(&rsp->exp_mutex); + rcu_exp_wait_wake(rsp, s); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c82c3640493f3..b6d5dde6eab99 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -759,14 +759,8 @@ void synchronize_rcu_expedited(void) /* Initialize the rcu_node tree in preparation for the wait. */ sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); - /* Wait for snapshotted ->blkd_tasks lists to drain. 
*/ - synchronize_sched_expedited_wait(rsp); - rcu_exp_gp_seq_end(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); - rcu_exp_wake(rsp, s); - - trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); - mutex_unlock(&rsp->exp_mutex); + /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ + rcu_exp_wait_wake(rsp, s); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -- GitLab From 179e5dcd1e5bdfac1128431d131b31322aedd2bc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Mar 2016 16:27:44 -0700 Subject: [PATCH 088/705] rcu: Consolidate expedited GP tracing into rcu_exp_gp_seq_snap() This commit moves some duplicate code from synchronize_rcu_expedited() and synchronize_sched_expedited() into rcu_exp_gp_seq_snap(). This doesn't save lines of code, but does eliminate a "tell me twice" issue. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 +++++--- kernel/rcu/tree_plugin.h | 2 -- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fd86eca9478e7..5b1c8fd89af01 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3392,8 +3392,12 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) } static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { + unsigned long s; + smp_mb(); /* Caller's modifications seen first by other CPUs. */ - return rcu_seq_snap(&rsp->expedited_sequence); + s = rcu_seq_snap(&rsp->expedited_sequence); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); + return s; } static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) { @@ -3898,8 +3902,6 @@ void synchronize_sched_expedited(void) /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); - if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b6d5dde6eab99..529a44085a636 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -748,8 +748,6 @@ void synchronize_rcu_expedited(void) } s = rcu_exp_gp_seq_snap(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); - if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ -- GitLab From aff12cdf86e6fa891d1c30c0fad112d138bd7b10 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Mar 2016 16:32:24 -0700 Subject: [PATCH 089/705] rcu: Consolidate expedited GP code into exp_funnel_lock() This commit pulls the grace-period-start counter adjustment and tracing from synchronize_rcu_expedited() and synchronize_sched_expedited() into exp_funnel_lock(), thus eliminating some code duplication. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++--- kernel/rcu/tree_plugin.h | 3 --- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5b1c8fd89af01..e8fff14e417b2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3653,6 +3653,8 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) mutex_unlock(&rsp->exp_mutex); return true; } + rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); return false; } @@ -3905,9 +3907,6 @@ void synchronize_sched_expedited(void) if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - rcu_exp_gp_seq_start(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); - /* Initialize the rcu_node tree in preparation for the wait. 
*/ sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 529a44085a636..ff1cd4e1188d3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -751,9 +751,6 @@ void synchronize_rcu_expedited(void) if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - rcu_exp_gp_seq_start(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); - /* Initialize the rcu_node tree in preparation for the wait. */ sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); -- GitLab From 3b5f668e715bc19610ad967ef97a7e8c55a186ec Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Mar 2016 16:47:55 -0700 Subject: [PATCH 090/705] rcu: Overlap wakeups with next expedited grace period The current expedited grace-period implementation makes subsequent grace periods wait on wakeups for the prior grace period. This does not fit the dictionary definition of "expedited", so this commit allows these two phases to overlap. Doing this requires four waitqueues rather than two because tasks can now be waiting on the previous, current, and next grace periods. The fourth waitqueue makes the bit masking work out nicely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 17 ++++++++++++++--- kernel/rcu/tree.h | 3 ++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e8fff14e417b2..1df100cb7a624 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ + .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \ } RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); @@ -3637,7 +3638,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, rnp->grphi, TPS("wait")); - wait_event(rnp->exp_wq[(s >> 1) & 0x1], + wait_event(rnp->exp_wq[(s >> 1) & 0x3], sync_exp_work_done(rsp, &rdp->exp_workdone2, s)); return true; @@ -3857,6 +3858,14 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + + /* + * Switch over to wakeup mode, allowing the next GP, but -only- the + * next GP, to proceed. 
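
A note on the queue count: only three generations can have waiters at once (previous, current, and next grace periods), but the index is simply the low bits of the grace-period number, and masking with 0x3 gives a clean power-of-two rotation; that is the "bit masking works out nicely" from the changelog. Worked out numerically:

	#include <stdio.h>

	int main(void)
	{
		unsigned long s;

		/*
		 * The sequence advances by 2 per grace period, so s >> 1 is a
		 * grace-period number and (s >> 1) & 0x3 rotates through the
		 * four rnp->exp_wq[] entries.
		 */
		for (s = 0; s <= 14; s += 2)
			printf("gp %2lu -> exp_wq[%lu]\n", s >> 1, (s >> 1) & 0x3);
		return 0;
	}
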
+ */ + mutex_lock(&rsp->exp_wake_mutex); + mutex_unlock(&rsp->exp_mutex); + rcu_for_each_node_breadth_first(rsp, rnp) { if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { spin_lock(&rnp->exp_lock); @@ -3865,10 +3874,10 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) rnp->exp_seq_rq = s; spin_unlock(&rnp->exp_lock); } - wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x1]); + wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); } trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); - mutex_unlock(&rsp->exp_mutex); + mutex_unlock(&rsp->exp_wake_mutex); } /** @@ -4530,6 +4539,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) rcu_init_one_nocb(rnp); init_waitqueue_head(&rnp->exp_wq[0]); init_waitqueue_head(&rnp->exp_wq[1]); + init_waitqueue_head(&rnp->exp_wq[2]); + init_waitqueue_head(&rnp->exp_wq[3]); spin_lock_init(&rnp->exp_lock); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f9d4fbb1e014e..1194ab0da56ac 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -250,7 +250,7 @@ struct rcu_node { spinlock_t exp_lock ____cacheline_internodealigned_in_smp; unsigned long exp_seq_rq; - wait_queue_head_t exp_wq[2]; + wait_queue_head_t exp_wq[4]; } ____cacheline_internodealigned_in_smp; /* @@ -502,6 +502,7 @@ struct rcu_state { /* End of fields guarded by barrier_mutex. */ struct mutex exp_mutex; /* Serialize expedited GP. */ + struct mutex exp_wake_mutex; /* Serialize wakeup. */ unsigned long expedited_sequence; /* Take a ticket. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ -- GitLab From 86057b80ae31d37fcbdb5f57d15aaf1148c69f96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 31 Dec 2015 08:48:36 -0800 Subject: [PATCH 091/705] rcu: Awaken grace-period kthread when stalled Recent kernels can fail to awaken the grace-period kthread for quiescent-state forcing. This commit is a crude hack that does a wakeup any time a stall is detected. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 531a328076bdd..a327a253c178e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1224,8 +1224,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) rsp->gp_flags, gp_state_getname(rsp->gp_state), rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0); - if (rsp->gp_kthread) + if (rsp->gp_kthread) { sched_show_task(rsp->gp_kthread); + wake_up_process(rsp->gp_kthread); + } } } -- GitLab From fcfd0a237bfcf0c314005007e9d76e55a25e2bad Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 3 Jan 2016 16:42:18 -0800 Subject: [PATCH 092/705] rcu: Make FQS schedule advance only if FQS happened Currently, the force-quiescent-state (FQS) code in rcu_gp_kthread() can advance the next FQS even if one was not executed last time. This can happen due to timeout-duration uncertainty. This commit therefore avoids advancing the FQS schedule unless an FQS was just executed. In the corner case where an FQS was not executed, but is due now, the code does a one-jiffy wait. This change prepares for kthread kicking. Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a327a253c178e..6116cfad18fff 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2146,6 +2146,15 @@ static int __noreturn rcu_gp_kthread(void *arg) TPS("fqsend")); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); + ret = 0; /* Force full wait till next FQS. */ + j = jiffies_till_next_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_next_fqs = HZ; + } else if (j < 1) { + j = 1; + jiffies_till_next_fqs = 1; + } } else { /* Deal with stray signal. */ cond_resched_rcu_qs(); @@ -2154,14 +2163,12 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqswaitsig")); - } - j = jiffies_till_next_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_next_fqs = HZ; - } else if (j < 1) { - j = 1; - jiffies_till_next_fqs = 1; + ret = 1; /* Keep old FQS timing. */ + j = jiffies; + if (time_after(jiffies, rsp->jiffies_force_qs)) + j = 1; + else + j = rsp->jiffies_force_qs - j; } } -- GitLab From 8c7c4829a81c1838f18c12ce5a3a5c29a08bf0a8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 3 Jan 2016 20:29:57 -0800 Subject: [PATCH 093/705] rcu: Awaken grace-period kthread if too long since FQS Recent kernels can fail to awaken the grace-period kthread for quiescent-state forcing. This commit is a crude hack that does a wakeup if a scheduling-clock interrupt sees that it has been too long since force-quiescent-state (FQS) processing. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 39 +++++++++++++++++++++++++++++++++++++-- kernel/rcu/tree.h | 2 ++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6116cfad18fff..a739292be6058 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -385,9 +385,11 @@ module_param(qlowmark, long, 0444); static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; +static bool rcu_kick_kthreads; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); +module_param(rcu_kick_kthreads, bool, 0644); /* * How long the grace period must be before we start recruiting @@ -1251,6 +1253,24 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) } } +/* + * If too much time has passed in the current grace period, and if + * so configured, go kick the relevant kthreads. + */ +static void rcu_stall_kick_kthreads(struct rcu_state *rsp) +{ + unsigned long j; + + if (!rcu_kick_kthreads) + return; + j = READ_ONCE(rsp->jiffies_kick_kthreads); + if (time_after(jiffies, j) && rsp->gp_kthread) { + WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); + wake_up_process(rsp->gp_kthread); + WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ); + } +} + static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; @@ -1262,6 +1282,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(rsp); + if (rcu_cpu_stall_suppress) + return; + /* Only let one CPU complain about others per time interval. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -1335,6 +1360,11 @@ static void print_cpu_stall(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + /* Kick and suppress, if so configured. 
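
The throttling in rcu_stall_kick_kthreads() above is worth noting: rather than keeping a counter, it advances a jiffies deadline so that no matter how often stall checking runs, the kthread is prodded at most about once per second. The same pattern in ordinary C (a reduction for illustration, not the kernel code; time() supplies the coarse one-second resolution that jiffies plus HZ supplies in the kernel):

	#include <stdbool.h>
	#include <time.h>

	static time_t next_kick;	/* models rsp->jiffies_kick_kthreads */

	/* At most one kick per second, however often this is called. */
	static bool maybe_kick(void)
	{
		time_t now = time(NULL);

		if (now <= next_kick)
			return false;
		next_kick = now + 1;	/* kernel: WRITE_ONCE(..., j + HZ) */
		return true;		/* caller wakes the kthread */
	}
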
*/ + rcu_stall_kick_kthreads(rsp); + if (rcu_cpu_stall_suppress) + return; + /* * OK, time to rat on ourselves... * See Documentation/RCU/stallwarn.txt for info on how to debug @@ -1379,8 +1409,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) + if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || + !rcu_gp_in_progress(rsp)) return; + rcu_stall_kick_kthreads(rsp); j = jiffies; /* @@ -2119,8 +2151,11 @@ static int __noreturn rcu_gp_kthread(void *arg) } ret = 0; for (;;) { - if (!ret) + if (!ret) { rsp->jiffies_force_qs = jiffies + j; + WRITE_ONCE(rsp->jiffies_kick_kthreads, + jiffies + 3 * j); + } trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqswait")); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index df668c0f9e649..34d3973f72235 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -513,6 +513,8 @@ struct rcu_state { unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ + unsigned long jiffies_kick_kthreads; /* Time at which to kick */ + /* kthreads, if configured. */ unsigned long n_force_qs; /* Number of calls to */ /* force_quiescent_state(). */ unsigned long n_force_qs_lh; /* ~Number of calls leaving */ -- GitLab From 293e2421fe25839500207eda123cc4475f8d17b8 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Wed, 23 Mar 2016 23:11:48 +0800 Subject: [PATCH 094/705] rcu: Remove superfluous versions of rcu_read_lock_sched_held() Currently, we have four versions of rcu_read_lock_sched_held(), depending on the combined choices on PREEMPT_COUNT and DEBUG_LOCK_ALLOC. However, there is an existing function preemptible() that already distinguishes between the PREEMPT_COUNT=y and PREEMPT_COUNT=n cases, and allows these four implementations to be consolidated down to two. This commit therefore uses preemptible() to achieve this consolidation. Note that there could be a small performance regression in the case of CONFIG_DEBUG_LOCK_ALLOC=y && PREEMPT_COUNT=n. However, given the overhead associated with CONFIG_DEBUG_LOCK_ALLOC=y, this should be down in the noise. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 17 +---------------- kernel/rcu/update.c | 4 ++-- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 45de591657a6f..5f1533e3d0320 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -508,14 +508,7 @@ int rcu_read_lock_bh_held(void); * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. 
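
The consolidation below hinges on preemptible(), which include/linux/preempt.h in this era defines as preempt_count() == 0 && !irqs_disabled() under CONFIG_PREEMPT_COUNT=y and as constant 0 otherwise; that constant-0 case is what lets !preemptible() stand in for the old unconditional "return 1" stub. A runnable model of both configurations (the globals here are stand-ins invented for this sketch):

	#include <stdbool.h>
	#include <stdio.h>

	static int preempt_count_v;			/* stand-in state */
	static bool irqs_disabled_v;
	static bool config_preempt_count = true;	/* CONFIG_PREEMPT_COUNT */

	static bool preemptible(void)
	{
		return config_preempt_count &&
		       preempt_count_v == 0 && !irqs_disabled_v;
	}

	static int rcu_read_lock_sched_held(void)
	{
		return !preemptible();	/* one body serves both configs */
	}

	int main(void)
	{
		printf("%d\n", rcu_read_lock_sched_held());	/* 0 */
		config_preempt_count = false;			/* PREEMPT_COUNT=n */
		printf("%d\n", rcu_read_lock_sched_held());	/* 1, old stub */
		return 0;
	}
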
*/ -#ifdef CONFIG_PREEMPT_COUNT int rcu_read_lock_sched_held(void); -#else /* #ifdef CONFIG_PREEMPT_COUNT */ -static inline int rcu_read_lock_sched_held(void) -{ - return 1; -} -#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */ #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -532,18 +525,10 @@ static inline int rcu_read_lock_bh_held(void) return 1; } -#ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) { - return preempt_count() != 0 || irqs_disabled(); + return !preemptible(); } -#else /* #ifdef CONFIG_PREEMPT_COUNT */ -static inline int rcu_read_lock_sched_held(void) -{ - return 1; -} -#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */ - #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_PROVE_RCU diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index ca828b41c938b..3ccdc8eebc5af 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -67,7 +67,7 @@ static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); #endif /* #ifndef CONFIG_TINY_RCU */ -#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) +#ifdef CONFIG_DEBUG_LOCK_ALLOC /** * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? * @@ -111,7 +111,7 @@ int rcu_read_lock_sched_held(void) return 0; if (debug_locks) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); - return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); + return lockdep_opinion || !preemptible(); } EXPORT_SYMBOL(rcu_read_lock_sched_held); #endif -- GitLab From 5dffed1e5721f6deae4fd67d32386ef037c5fc56 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Feb 2016 11:54:28 -0800 Subject: [PATCH 095/705] rcu: Dump ftrace buffer when kicking grace-period kthread If it is necessary to kick the grace-period kthread, that is a good time to dump the trace buffer in order to learn why kicking was needed. This commit therefore does the dump. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a739292be6058..86edb92276d38 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1266,6 +1266,7 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) j = READ_ONCE(rsp->jiffies_kick_kthreads); if (time_after(jiffies, j) && rsp->gp_kthread) { WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); + rcu_ftrace_dump(DUMP_ALL); wake_up_process(rsp->gp_kthread); WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ); } -- GitLab From fd35be623a1534bde57029c429b206d6c22a1ef6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Jan 2016 13:13:12 -0800 Subject: [PATCH 096/705] rcutorture: Update scripting to accommodate rcuperf This commit adds the scripting changes to add support for the shiny new rcuperf kernel module. Signed-off-by: Paul E. 
McKenney --- .../rcutorture/bin/kvm-recheck-rcuperf.sh | 82 +++++++++++++++++++ .../selftests/rcutorture/bin/kvm-recheck.sh | 5 +- tools/testing/selftests/rcutorture/bin/kvm.sh | 2 +- .../rcutorture/configs/rcuperf/CFLIST | 1 + .../rcutorture/configs/rcuperf/CFcommon | 2 + .../selftests/rcutorture/configs/rcuperf/TREE | 19 +++++ .../configs/rcuperf/ver_functions.sh | 52 ++++++++++++ 7 files changed, 161 insertions(+), 2 deletions(-) create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh create mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST create mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon create mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/TREE create mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh new file mode 100755 index 0000000000000..e5b28174fda0d --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# +# Analyze a given results directory for rcuperf performance measurements. +# +# Usage: kvm-recheck-rcuperf.sh resdir +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2016 +# +# Authors: Paul E. McKenney + +i="$1" +if test -d $i +then + : +else + echo Unreadable results directory: $i + exit 1 +fi +. tools/testing/selftests/rcutorture/bin/functions.sh + +configfile=`echo $i | sed -e 's/^.*\///'` + +grep -e '-perf:.*writer-duration' $i/console.log | sed -e 's/^\[[^]]*]//' | +awk ' +{ + gptimes[++n] = $5 / 1000.; + sum += $5 / 1000.; +} + +END { + if (NR <= 0) { + print "No rcuperf records found???" 
+ exit; + } + asort(gptimes); + pct50 = int(NR * 50 / 100); + if (pct50 < 1) + pct50 = 1; + pct90 = int(NR * 90 / 100); + if (pct90 < 1) + pct90 = 1; + pct99 = int(NR * 99 / 100); + if (pct99 < 1) + pct99 = 1; + div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100; + print "Histogram bucket size: " div; + last = gptimes[1] - 10; + count = 0; + for (i = 1; i <= NR; i++) { + current = div * int(gptimes[i] / div); + if (last == current) { + count++; + } else { + if (count > 0) + print last, count; + count = 1; + last = current; + } + } + if (count > 0) + print last, count; + print "Average grace-period duration: " sum / NR " microseconds"; + print "Minimum grace-period duration: " gptimes[1]; + print "50th percentile grace-period duration: " gptimes[pct50]; + print "90th percentile grace-period duration: " gptimes[pct90]; + print "99th percentile grace-period duration: " gptimes[pct99]; + print "Maximum grace-period duration: " gptimes[NR]; +}' diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index d86bdd6b6cc2d..f659346d33585 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -48,7 +48,10 @@ do cat $i/Make.oldconfig.err fi parse-build.sh $i/Make.out $configfile - parse-torture.sh $i/console.log $configfile + if test "$TORTURE_SUITE" != rcuperf + then + parse-torture.sh $i/console.log $configfile + fi parse-console.sh $i/console.log $configfile if test -r $i/Warnings then diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 4a431767f77a0..c33cb582b3dcb 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -156,7 +156,7 @@ do shift ;; --torture) - checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\)$' '^--' + checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\)$' '^--' TORTURE_SUITE=$2 shift ;; diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST b/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST new file mode 100644 index 0000000000000..c9f56cf20775b --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST @@ -0,0 +1 @@ +TREE diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon new file mode 100644 index 0000000000000..a09816b8c0f3f --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon @@ -0,0 +1,2 @@ +CONFIG_RCU_PERF_TEST=y +CONFIG_PRINTK_TIME=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE new file mode 100644 index 0000000000000..614e107f6db5c --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE @@ -0,0 +1,19 @@ +CONFIG_SMP=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +#CHECK#CONFIG_PREEMPT_RCU=y +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_RCU_TRACE=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh new file mode 100644 index 0000000000000..34f2a1b35ee5e 
--- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# +# Torture-suite-dependent shell functions for the rest of the scripts. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2015 +# +# Authors: Paul E. McKenney + +# rcuperf_param_nreaders bootparam-string +# +# Adds nreaders rcuperf module parameter if not already specified. +rcuperf_param_nreaders () { + if ! echo "$1" | grep -q "rcuperf.nreaders" + then + echo rcuperf.nreaders=-1 + fi +} + +# rcuperf_param_nwriters bootparam-string +# +# Adds nwriters rcuperf module parameter if not already specified. +rcuperf_param_nwriters () { + if ! echo "$1" | grep -q "rcuperf.nwriters" + then + echo rcuperf.nwriters=-1 + fi +} + +# per_version_boot_params bootparam-string config-file seconds +# +# Adds per-version torture-module parameters to kernels supporting them. +per_version_boot_params () { + echo $1 `rcuperf_param_nreaders "$1"` \ + `rcuperf_param_nwriters "$1"` \ + rcuperf.perf_runnable=1 \ + rcuperf.shutdown=1 \ + rcuperf.verbose=1 +} -- GitLab From 9efafb8849f732a3497f46f178b350c9ff7cfe27 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 31 Dec 2015 18:11:47 -0800 Subject: [PATCH 097/705] rcutorture: Allow for rcupdate.rcu_normal Currently, rcu_torture_writer() checks only for rcu_gp_is_expedited() when deciding whether or not to do dynamic control of RCU expediting. This means that if rcupdate.rcu_normal is specified, rcu_torture_writer() will attempt to dynamically control RCU expediting, but will nonetheless only test normal RCU grace periods. This commit therefore adds a check for !rcu_gp_is_normal(), and prints a message and desists from testing dynamic control of RCU expediting when doing so is futile. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 463867c432216..9234e75b106ae 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -916,7 +916,7 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { - bool can_expedite = !rcu_gp_is_expedited(); + bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); int expediting = 0; unsigned long gp_snap; bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; @@ -932,7 +932,7 @@ rcu_torture_writer(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); if (!can_expedite) { pr_alert("%s" TORTURE_FLAG - " Grace periods expedited from boot/sysfs for %s,\n", + " GP expediting controlled from boot/sysfs for %s,\n", torture_type, cur_ops->name); pr_alert("%s" TORTURE_FLAG " Disabled dynamic grace-period expediting.\n", -- GitLab From 291783b8ad77a83a6fdf91d55eee7f1ad72ed4d1 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 12 Jan 2016 13:43:30 -0800 Subject: [PATCH 098/705] rcutorture: Expedited-GP batch progress access to torturing This commit provides rcu_exp_batches_completed() and rcu_exp_batches_completed_sched() functions to allow torture-test modules to check how many expedited grace period batches have completed. These are analogous to the existing rcu_batches_completed(), rcu_batches_completed_bh(), and rcu_batches_completed_sched() functions. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 16 ++++++++++++++++ include/linux/rcutree.h | 2 ++ kernel/rcu/tree.c | 22 ++++++++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 64809aea661ce..93aea75029fbd 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -149,6 +149,22 @@ static inline unsigned long rcu_batches_completed_sched(void) return 0; } +/* + * Return the number of expedited grace periods completed. + */ +static inline unsigned long rcu_exp_batches_completed(void) +{ + return 0; +} + +/* + * Return the number of expedited sched grace periods completed. + */ +static inline unsigned long rcu_exp_batches_completed_sched(void) +{ + return 0; +} + static inline void rcu_force_quiescent_state(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index ad1eda9fa4dae..5043cb823fb27 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -87,6 +87,8 @@ unsigned long rcu_batches_started_sched(void); unsigned long rcu_batches_completed(void); unsigned long rcu_batches_completed_bh(void); unsigned long rcu_batches_completed_sched(void); +unsigned long rcu_exp_batches_completed(void); +unsigned long rcu_exp_batches_completed_sched(void); void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 531a328076bdd..88df64087dfea 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -459,6 +459,28 @@ unsigned long rcu_batches_completed_bh(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); +/* + * Return the number of RCU expedited batches completed thus far for + * debug & stats. Odd numbers mean that a batch is in progress, even + * numbers mean idle. The value returned will thus be roughly double + * the cumulative batches since boot. + */ +unsigned long rcu_exp_batches_completed(void) +{ + return rcu_state_p->expedited_sequence; +} +EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); + +/* + * Return the number of RCU-sched expedited batches completed thus far + * for debug & stats. Similar to rcu_exp_batches_completed(). + */ +unsigned long rcu_exp_batches_completed_sched(void) +{ + return rcu_sched_state.expedited_sequence; +} +EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched); + /* * Force a quiescent state. */ -- GitLab From 8704baab9bc848b58c129fed6b591bb84ec02f41 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 31 Dec 2015 18:33:22 -0800 Subject: [PATCH 099/705] rcutorture: Add RCU grace-period performance tests This commit adds a new rcuperf module that carries out simple performance tests of RCU grace periods. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/Makefile | 1 + kernel/rcu/rcuperf.c | 637 +++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 33 +++ 3 files changed, 671 insertions(+) create mode 100644 kernel/rcu/rcuperf.c diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 032b2c015beb6..18dfc485225c3 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -5,6 +5,7 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o +obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c new file mode 100644 index 0000000000000..9d54a57bee7d0 --- /dev/null +++ b/kernel/rcu/rcuperf.c @@ -0,0 +1,637 @@ +/* + * Read-Copy Update module-based performance-test facility + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2015 + * + * Authors: Paul E. McKenney + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney "); + +#define PERF_FLAG "-perf:" +#define PERFOUT_STRING(s) \ + pr_alert("%s" PERF_FLAG s "\n", perf_type) +#define VERBOSE_PERFOUT_STRING(s) \ + do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) +#define VERBOSE_PERFOUT_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" PERF_FLAG "!!! 
%s\n", perf_type, s); } while (0) + +torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, nwriters, -1, "Number of RCU updater threads"); +torture_param(bool, shutdown, false, "Shutdown at end of performance tests."); +torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); + +static char *perf_type = "rcu"; +module_param(perf_type, charp, 0444); +MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)"); + +static int nrealreaders; +static int nrealwriters; +static struct task_struct **writer_tasks; +static struct task_struct **reader_tasks; +static struct task_struct *shutdown_task; + +static u64 **writer_durations; +static int *writer_n_durations; +static atomic_t n_rcu_perf_reader_started; +static atomic_t n_rcu_perf_writer_started; +static atomic_t n_rcu_perf_writer_finished; +static wait_queue_head_t shutdown_wq; +static u64 t_rcu_perf_writer_started; +static u64 t_rcu_perf_writer_finished; +static unsigned long b_rcu_perf_writer_started; +static unsigned long b_rcu_perf_writer_finished; + +static int rcu_perf_writer_state; +#define RTWS_INIT 0 +#define RTWS_EXP_SYNC 1 +#define RTWS_SYNC 2 +#define RTWS_IDLE 2 +#define RTWS_STOPPING 3 + +#define MAX_MEAS 10000 +#define MIN_MEAS 100 + +#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE) +#define RCUPERF_RUNNABLE_INIT 1 +#else +#define RCUPERF_RUNNABLE_INIT 0 +#endif +static int perf_runnable = RCUPERF_RUNNABLE_INIT; +module_param(perf_runnable, int, 0444); +MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); + +/* + * Operations vector for selecting different types of tests. + */ + +struct rcu_perf_ops { + int ptype; + void (*init)(void); + void (*cleanup)(void); + int (*readlock)(void); + void (*readunlock)(int idx); + unsigned long (*started)(void); + unsigned long (*completed)(void); + unsigned long (*exp_completed)(void); + void (*sync)(void); + void (*exp_sync)(void); + const char *name; +}; + +static struct rcu_perf_ops *cur_ops; + +/* + * Definitions for rcu perf testing. + */ + +static int rcu_perf_read_lock(void) __acquires(RCU) +{ + rcu_read_lock(); + return 0; +} + +static void rcu_perf_read_unlock(int idx) __releases(RCU) +{ + rcu_read_unlock(); +} + +static unsigned long __maybe_unused rcu_no_completed(void) +{ + return 0; +} + +static void rcu_sync_perf_init(void) +{ +} + +static struct rcu_perf_ops rcu_ops = { + .ptype = RCU_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = rcu_perf_read_lock, + .readunlock = rcu_perf_read_unlock, + .started = rcu_batches_started, + .completed = rcu_batches_completed, + .exp_completed = rcu_exp_batches_completed, + .sync = synchronize_rcu, + .exp_sync = synchronize_rcu_expedited, + .name = "rcu" +}; + +/* + * Definitions for rcu_bh perf testing. + */ + +static int rcu_bh_perf_read_lock(void) __acquires(RCU_BH) +{ + rcu_read_lock_bh(); + return 0; +} + +static void rcu_bh_perf_read_unlock(int idx) __releases(RCU_BH) +{ + rcu_read_unlock_bh(); +} + +static struct rcu_perf_ops rcu_bh_ops = { + .ptype = RCU_BH_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = rcu_bh_perf_read_lock, + .readunlock = rcu_bh_perf_read_unlock, + .started = rcu_batches_started_bh, + .completed = rcu_batches_completed_bh, + .exp_completed = rcu_exp_batches_completed_sched, + .sync = synchronize_rcu_bh, + .exp_sync = synchronize_rcu_bh_expedited, + .name = "rcu_bh" +}; + +/* + * Definitions for srcu perf testing. 
+ */ + +DEFINE_STATIC_SRCU(srcu_ctl_perf); +static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf; + +static int srcu_perf_read_lock(void) __acquires(srcu_ctlp) +{ + return srcu_read_lock(srcu_ctlp); +} + +static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp) +{ + srcu_read_unlock(srcu_ctlp, idx); +} + +static unsigned long srcu_perf_completed(void) +{ + return srcu_batches_completed(srcu_ctlp); +} + +static void srcu_perf_synchronize(void) +{ + synchronize_srcu(srcu_ctlp); +} + +static void srcu_perf_synchronize_expedited(void) +{ + synchronize_srcu_expedited(srcu_ctlp); +} + +static struct rcu_perf_ops srcu_ops = { + .ptype = SRCU_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = srcu_perf_read_lock, + .readunlock = srcu_perf_read_unlock, + .started = NULL, + .completed = srcu_perf_completed, + .exp_completed = srcu_perf_completed, + .sync = srcu_perf_synchronize, + .exp_sync = srcu_perf_synchronize_expedited, + .name = "srcu" +}; + +/* + * Definitions for sched perf testing. + */ + +static int sched_perf_read_lock(void) +{ + preempt_disable(); + return 0; +} + +static void sched_perf_read_unlock(int idx) +{ + preempt_enable(); +} + +static struct rcu_perf_ops sched_ops = { + .ptype = RCU_SCHED_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = sched_perf_read_lock, + .readunlock = sched_perf_read_unlock, + .started = rcu_batches_started_sched, + .completed = rcu_batches_completed_sched, + .exp_completed = rcu_exp_batches_completed_sched, + .sync = synchronize_sched, + .exp_sync = synchronize_sched_expedited, + .name = "sched" +}; + +#ifdef CONFIG_TASKS_RCU + +/* + * Definitions for RCU-tasks perf testing. + */ + +static int tasks_perf_read_lock(void) +{ + return 0; +} + +static void tasks_perf_read_unlock(int idx) +{ +} + +static struct rcu_perf_ops tasks_ops = { + .ptype = RCU_TASKS_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = tasks_perf_read_lock, + .readunlock = tasks_perf_read_unlock, + .started = rcu_no_completed, + .completed = rcu_no_completed, + .sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_tasks, + .name = "tasks" +}; + +#define RCUPERF_TASKS_OPS &tasks_ops, + +static bool __maybe_unused torturing_tasks(void) +{ + return cur_ops == &tasks_ops; +} + +#else /* #ifdef CONFIG_TASKS_RCU */ + +#define RCUPERF_TASKS_OPS + +static bool __maybe_unused torturing_tasks(void) +{ + return false; +} + +#endif /* #else #ifdef CONFIG_TASKS_RCU */ + +/* + * If performance tests complete, wait for shutdown to commence. + */ +static void rcu_perf_wait_shutdown(void) +{ + cond_resched_rcu_qs(); + if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) + return; + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); +} + +/* + * RCU perf reader kthread. Repeatedly does empty RCU read-side + * critical section, minimizing update-side interference. + */ +static int +rcu_perf_reader(void *arg) +{ + unsigned long flags; + int idx; + + VERBOSE_PERFOUT_STRING("rcu_perf_reader task started"); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_rcu_perf_reader_started); + + do { + local_irq_save(flags); + idx = cur_ops->readlock(); + cur_ops->readunlock(idx); + local_irq_restore(flags); + rcu_perf_wait_shutdown(); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_perf_reader"); + return 0; +} + +/* + * RCU perf writer kthread. Repeatedly does a grace period. 
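The writer shown next brackets every grace-period wait with ktime_get_mono_fast_ns() timestamps and keeps one duration array per writer for later percentile analysis. As a hedged illustration of just that measurement loop, stripped of the kthread and torture plumbing (userspace C with clock_gettime() standing in for the kernel clock; names are illustrative):

#include <stdio.h>
#include <time.h>

/* Stand-in for cur_ops->sync(); here it just sleeps roughly 1 ms. */
static void fake_sync(void)
{
	struct timespec ts = { 0, 1000000 };

	nanosleep(&ts, NULL);
}

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

int main(void)
{
	long long wd[10];	/* analogous to writer_durations[me] */
	int i;

	for (i = 0; i < 10; i++) {
		wd[i] = now_ns();	/* *wdp = ktime_get_mono_fast_ns(); */
		fake_sync();		/* cur_ops->sync() or ->exp_sync() */
		wd[i] = now_ns() - wd[i];
		printf("writer-duration %d: %lld ns\n", i, wd[i]);
	}
	return 0;
}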
+ */ +static int +rcu_perf_writer(void *arg) +{ + int i = 0; + int i_max; + long me = (long)arg; + bool started = false, done = false, alldone = false; + u64 t; + u64 *wdp; + u64 *wdpp = writer_durations[me]; + + VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); + WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp); + WARN_ON(rcu_gp_is_normal() && gp_exp); + WARN_ON(!wdpp); + t = ktime_get_mono_fast_ns(); + if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { + t_rcu_perf_writer_started = t; + if (gp_exp) { + b_rcu_perf_writer_started = + cur_ops->exp_completed() / 2; + } else { + b_rcu_perf_writer_started = + cur_ops->completed(); + } + } + + do { + wdp = &wdpp[i]; + *wdp = ktime_get_mono_fast_ns(); + if (gp_exp) { + rcu_perf_writer_state = RTWS_EXP_SYNC; + cur_ops->exp_sync(); + } else { + rcu_perf_writer_state = RTWS_SYNC; + cur_ops->sync(); + } + rcu_perf_writer_state = RTWS_IDLE; + t = ktime_get_mono_fast_ns(); + *wdp = t - *wdp; + i_max = i; + if (!started && + atomic_read(&n_rcu_perf_writer_started) >= nrealwriters) + started = true; + if (!done && i >= MIN_MEAS) { + done = true; + pr_alert("%s" PERF_FLAG + "rcu_perf_writer %ld has %d measurements\n", + perf_type, me, MIN_MEAS); + if (atomic_inc_return(&n_rcu_perf_writer_finished) >= + nrealwriters) { + PERFOUT_STRING("Test complete"); + t_rcu_perf_writer_finished = t; + if (gp_exp) { + b_rcu_perf_writer_finished = + cur_ops->exp_completed() / 2; + } else { + b_rcu_perf_writer_finished = + cur_ops->completed(); + } + smp_mb(); /* Assign before wake. */ + wake_up(&shutdown_wq); + } + } + if (done && !alldone && + atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters) + alldone = true; + if (started && !alldone && i < MAX_MEAS - 1) + i++; + rcu_perf_wait_shutdown(); + } while (!torture_must_stop()); + rcu_perf_writer_state = RTWS_STOPPING; + writer_n_durations[me] = i_max; + torture_kthread_stopping("rcu_perf_writer"); + return 0; +} + +static inline void +rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag) +{ + pr_alert("%s" PERF_FLAG + "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", + perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown); +} + +static void +rcu_perf_cleanup(void) +{ + int i; + int j; + int ngps = 0; + u64 *wdp; + u64 *wdpp; + + if (torture_cleanup_begin()) + return; + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_perf_reader, + reader_tasks[i]); + kfree(reader_tasks); + } + + if (writer_tasks) { + for (i = 0; i < nrealwriters; i++) { + torture_stop_kthread(rcu_perf_writer, + writer_tasks[i]); + if (!writer_n_durations) + continue; + j = writer_n_durations[i]; + pr_alert("%s%s writer %d gps: %d\n", + perf_type, PERF_FLAG, i, j); + ngps += j; + } + pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", + perf_type, PERF_FLAG, + t_rcu_perf_writer_started, t_rcu_perf_writer_finished, + t_rcu_perf_writer_finished - + t_rcu_perf_writer_started, + ngps, + b_rcu_perf_writer_finished - + b_rcu_perf_writer_started); + for (i = 0; i < nrealwriters; i++) { + if (!writer_durations) + break; + if (!writer_n_durations) + continue; + wdpp = writer_durations[i]; + if (!wdpp) + continue; + for (j = 0; j <= writer_n_durations[i]; j++) { + wdp = &wdpp[j]; + pr_alert("%s%s %4d writer-duration: %5d %llu\n", + perf_type, PERF_FLAG, + i, j, *wdp); + if (j % 100 == 0) + schedule_timeout_uninterruptible(1); + } + kfree(writer_durations[i]); + } + kfree(writer_tasks); + kfree(writer_durations); + 
kfree(writer_n_durations); + } + + /* Do flavor-specific cleanup operations. */ + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +/* + * Return the number if non-negative. If -1, the number of CPUs. + * If less than -1, that much less than the number of CPUs, but + * at least one. + */ +static int compute_real(int n) +{ + int nr; + + if (n >= 0) { + nr = n; + } else { + nr = num_online_cpus() + 1 + n; + if (nr <= 0) + nr = 1; + } + return nr; +} + +/* + * RCU perf shutdown kthread. Just waits to be awakened, then shuts + * down system. + */ +static int +rcu_perf_shutdown(void *arg) +{ + do { + wait_event(shutdown_wq, + atomic_read(&n_rcu_perf_writer_finished) >= + nrealwriters); + } while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters); + smp_mb(); /* Wake before output. */ + rcu_perf_cleanup(); + kernel_power_off(); + return -EINVAL; +} + +static int __init +rcu_perf_init(void) +{ + long i; + int firsterr = 0; + static struct rcu_perf_ops *perf_ops[] = { + &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + RCUPERF_TASKS_OPS + }; + + if (!torture_init_begin(perf_type, verbose, &perf_runnable)) + return -EBUSY; + + /* Process args and tell the world that the perf'er is on the job. */ + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { + cur_ops = perf_ops[i]; + if (strcmp(perf_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(perf_ops)) { + pr_alert("rcu-perf: invalid perf type: \"%s\"\n", + perf_type); + pr_alert("rcu-perf types:"); + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) + pr_alert(" %s", perf_ops[i]->name); + pr_alert("\n"); + firsterr = -EINVAL; + goto unwind; + } + if (cur_ops->init) + cur_ops->init(); + + nrealwriters = compute_real(nwriters); + nrealreaders = compute_real(nreaders); + atomic_set(&n_rcu_perf_reader_started, 0); + atomic_set(&n_rcu_perf_writer_started, 0); + atomic_set(&n_rcu_perf_writer_finished, 0); + rcu_perf_print_module_parms(cur_ops, "Start of test"); + + /* Start up the kthreads. 
*/ + + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(rcu_perf_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_PERFOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + firsterr = torture_create_kthread(rcu_perf_reader, NULL, + reader_tasks[i]); + if (firsterr) + goto unwind; + } + while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders) + schedule_timeout_uninterruptible(1); + writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), + GFP_KERNEL); + writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), + GFP_KERNEL); + writer_n_durations = + kcalloc(nrealwriters, sizeof(*writer_n_durations), + GFP_KERNEL); + if (!writer_tasks || !writer_durations || !writer_n_durations) { + VERBOSE_PERFOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters; i++) { + writer_durations[i] = + kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), + GFP_KERNEL); + if (!writer_durations[i]) + goto unwind; + firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, + writer_tasks[i]); + if (firsterr) + goto unwind; + } + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + rcu_perf_cleanup(); + return firsterr; +} + +module_init(rcu_perf_init); +module_exit(rcu_perf_cleanup); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1e9a607534ca0..f4b797a690ba1 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1289,6 +1289,39 @@ config TORTURE_TEST tristate default n +config RCU_PERF_TEST + tristate "performance tests for RCU" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + default n + help + This option provides a kernel module that runs performance + tests on the RCU infrastructure. The kernel module may be built + after the fact on the running kernel to be tested, if desired. + + Say Y here if you want RCU performance tests to be built into + the kernel. + Say M if you want the RCU performance tests to build as a module. + Say N if you are unsure. + +config RCU_PERF_TEST_RUNNABLE + bool "performance tests for RCU runnable by default" + depends on RCU_PERF_TEST = y + default n + help + This option provides a way to build the RCU performance tests + directly into the kernel without them starting up at boot time. + You can use /sys/module to manually override this setting. + This /proc file is available only when the RCU performance + tests have been built into the kernel. + + Say Y here if you want the RCU performance tests to start during + boot (you probably don't). + Say N here if you want the RCU performance tests to start only + after being manually enabled via /sys/module. + config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL -- GitLab From bdea9e347783c2724997db7c5d5b45a301e2dc90 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 Jan 2016 13:47:19 -0800 Subject: [PATCH 100/705] rcutorture: Documentation for rcuperf kernel parameters This commit adds documentation for the new rcuperf module's kernel boot parameters. Signed-off-by: Paul E. 
McKenney --- Documentation/kernel-parameters.txt | 32 +++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index ecc74fa4bfde8..951af481da5a6 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3284,6 +3284,38 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Lazy RCU callbacks are those which RCU can prove do nothing more than free memory. + rcuperf.gp_exp= [KNL] + Measure performance of expedited synchronous + grace-period primitives. + + rcuperf.nreaders= [KNL] + Set number of RCU readers. The value -1 selects + N, where N is the number of CPUs. A value + "n" less than -1 selects N-n+1, where N is again + the number of CPUs. For example, -2 selects N + (the number of CPUs), -3 selects N+1, and so on. + A value of "n" less than or equal to -N selects + a single reader. + + rcuperf.nwriters= [KNL] + Set number of RCU writers. The values operate + the same as for rcuperf.nreaders. + N, where N is the number of CPUs + + rcuperf.perf_runnable= [BOOT] + Start rcuperf running at boot time. + + rcuperf.shutdown= [KNL] + Shut the system down after performance tests + complete. This is useful for hands-off automated + testing. + + rcuperf.perf_type= [KNL] + Specify the RCU implementation to test. + + rcuperf.verbose= [KNL] + Enable additional printk() statements. + rcutorture.cbflood_inter_holdoff= [KNL] Set holdoff time (jiffies) between successive callback-flood tests. -- GitLab From 6b558c4c7a4ba410e39dbcb9d4c2b6e928c09308 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jan 2016 14:15:40 -0800 Subject: [PATCH 101/705] rcutorture: Bind rcuperf reader/writer kthreads to CPUs This commit forces more deterministic behavior by binding rcuperf's rcu_perf_reader() and rcu_perf_writer() kthreads to their respective CPUs. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 9d54a57bee7d0..7a1edf417d187 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -328,8 +328,10 @@ rcu_perf_reader(void *arg) { unsigned long flags; int idx; + long me = (long)arg; VERBOSE_PERFOUT_STRING("rcu_perf_reader task started"); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); atomic_inc(&n_rcu_perf_reader_started); @@ -362,6 +364,7 @@ rcu_perf_writer(void *arg) WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp); WARN_ON(rcu_gp_is_normal() && gp_exp); WARN_ON(!wdpp); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); t = ktime_get_mono_fast_ns(); if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { t_rcu_perf_writer_started = t; @@ -594,7 +597,7 @@ rcu_perf_init(void) goto unwind; } for (i = 0; i < nrealreaders; i++) { - firsterr = torture_create_kthread(rcu_perf_reader, NULL, + firsterr = torture_create_kthread(rcu_perf_reader, (void *)i, reader_tasks[i]); if (firsterr) goto unwind; -- GitLab From 2094c99558d9e9374210898f65f5862f7a2e8bed Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jan 2016 15:17:21 -0800 Subject: [PATCH 102/705] rcutorture: Set rcuperf writer kthreads to real-time priority This commit forces more deterministic update-side behavior by setting rcuperf's rcu_perf_writer() kthreads to real-time priority. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcuperf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 7a1edf417d187..e18d016a98886 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -355,6 +355,7 @@ rcu_perf_writer(void *arg) int i = 0; int i_max; long me = (long)arg; + struct sched_param sp; bool started = false, done = false, alldone = false; u64 t; u64 *wdp; @@ -365,6 +366,8 @@ rcu_perf_writer(void *arg) WARN_ON(rcu_gp_is_normal() && gp_exp); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + sp.sched_priority = 1; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); t = ktime_get_mono_fast_ns(); if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { t_rcu_perf_writer_started = t; -- GitLab From e588f35492227cc4ab2cbfe95fd5f993a5086f9f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jan 2016 17:26:35 -0800 Subject: [PATCH 103/705] rcutorture: Print measure of batching efficiency This commit adds a line giving the number of grace periods, the number of batches, and the ratio. The larger the ratio, the greater the batching efficiency. Signed-off-by: Paul E. McKenney --- .../rcutorture/bin/kvm-recheck-rcuperf.sh | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh index e5b28174fda0d..1f72df8eedc74 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh @@ -34,33 +34,38 @@ fi configfile=`echo $i | sed -e 's/^.*\///'` -grep -e '-perf:.*writer-duration' $i/console.log | sed -e 's/^\[[^]]*]//' | +sed -e 's/^\[[^]]*]//' < $i/console.log | awk ' -{ +/-perf: .* gps: .* batches:/ { + ngps = $9; + nbatches = $11; +} + +/-perf: .*writer-duration/ { gptimes[++n] = $5 / 1000.; sum += $5 / 1000.; } END { - if (NR <= 0) { + newNR = asort(gptimes); + if (newNR <= 0) { print "No rcuperf records found???" exit; } - asort(gptimes); - pct50 = int(NR * 50 / 100); + pct50 = int(newNR * 50 / 100); if (pct50 < 1) pct50 = 1; - pct90 = int(NR * 90 / 100); + pct90 = int(newNR * 90 / 100); if (pct90 < 1) pct90 = 1; - pct99 = int(NR * 99 / 100); + pct99 = int(newNR * 99 / 100); if (pct99 < 1) pct99 = 1; div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100; print "Histogram bucket size: " div; last = gptimes[1] - 10; count = 0; - for (i = 1; i <= NR; i++) { + for (i = 1; i <= newNR; i++) { current = div * int(gptimes[i] / div); if (last == current) { count++; @@ -73,10 +78,11 @@ END { } if (count > 0) print last, count; - print "Average grace-period duration: " sum / NR " microseconds"; + print "Average grace-period duration: " sum / newNR " microseconds"; print "Minimum grace-period duration: " gptimes[1]; print "50th percentile grace-period duration: " gptimes[pct50]; print "90th percentile grace-period duration: " gptimes[pct90]; print "99th percentile grace-period duration: " gptimes[pct99]; - print "Maximum grace-period duration: " gptimes[NR]; + print "Maximum grace-period duration: " gptimes[newNR]; + print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches; }' -- GitLab From ac2bb275e8e5abddb0815ff2b7aa383ed6d007a4 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 29 Jan 2016 14:58:17 -0800 Subject: [PATCH 104/705] rcutorture: Make rcuperf collect expedited event-trace data This commit enables ftrace in the rcuperf TREE kernel build and adds an ftrace_dump() at the end of rcuperf processing. This data will be used to measure the actual durations of the expedited grace periods without the added delays inherent in the kernel-module measurements. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 1 + tools/testing/selftests/rcutorture/configs/rcuperf/TREE | 1 + 2 files changed, 2 insertions(+) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index e18d016a98886..12561f96f0a29 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -404,6 +404,7 @@ rcu_perf_writer(void *arg) perf_type, me, MIN_MEAS); if (atomic_inc_return(&n_rcu_perf_writer_finished) >= nrealwriters) { + rcu_ftrace_dump(DUMP_ALL); PERFOUT_STRING("Test complete"); t_rcu_perf_writer_finished = t; if (gp_exp) { diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE index 614e107f6db5c..a312f671a29a4 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE @@ -17,3 +17,4 @@ CONFIG_PROVE_LOCKING=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y +CONFIG_RCU_TRACE=y -- GitLab From 2b03d038457fc8d694d34981cb0a2f1702ba35d6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 30 Jan 2016 16:51:36 -0800 Subject: [PATCH 105/705] rcutorture: Make scripts analyze rcuperf trace data, if present The rcuperf event-trace data is more accurate than are the rcuperf printk()s because locking keeps things ordered. This commit therefore parses and analyzes this event-trace data if present, and falls back on the printk()s otherwise. Signed-off-by: Paul E. McKenney --- .../bin/kvm-recheck-rcuperf-ftrace.sh | 121 ++++++++++++++++++ .../rcutorture/bin/kvm-recheck-rcuperf.sh | 8 ++ 2 files changed, 129 insertions(+) create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh new file mode 100755 index 0000000000000..f79b0e9e84fcf --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh @@ -0,0 +1,121 @@ +#!/bin/bash +# +# Analyze a given results directory for rcuperf performance measurements, +# looking for ftrace data. Exits with 0 if data was found, analyzed, and +# printed. Intended to be invoked from kvm-recheck-rcuperf.sh after +# argument checking. +# +# Usage: kvm-recheck-rcuperf-ftrace.sh resdir +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2016 +# +# Authors: Paul E. McKenney + +i="$1" +. 
tools/testing/selftests/rcutorture/bin/functions.sh + +if test "`grep -c 'rcu_exp_grace_period.*start' < $i/console.log`" -lt 100 +then + exit 10 +fi + +sed -e 's/^\[[^]]*]//' < $i/console.log | +grep 'us : rcu_exp_grace_period' | +sed -e 's/us : / : /' | +tr -d '\015' | +awk ' +$8 == "start" { + if (starttask != "") + nlost++; + starttask = $1; + starttime = $3; + startseq = $7; +} + +$8 == "end" { + if (starttask == $1 && startseq == $7) { + curgpdur = $3 - starttime; + gptimes[++n] = curgpdur; + gptaskcnt[starttask]++; + sum += curgpdur; + if (curgpdur > 1000) + print "Long GP " starttime "us to " $3 "us (" curgpdur "us)"; + starttask = ""; + } else { + # Lost a message or some such, reset. + starttask = ""; + nlost++; + } +} + +$8 == "done" { + piggybackcnt[$1]++; +} + +END { + newNR = asort(gptimes); + if (newNR <= 0) { + print "No ftrace records found???" + exit 10; + } + pct50 = int(newNR * 50 / 100); + if (pct50 < 1) + pct50 = 1; + pct90 = int(newNR * 90 / 100); + if (pct90 < 1) + pct90 = 1; + pct99 = int(newNR * 99 / 100); + if (pct99 < 1) + pct99 = 1; + div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100; + print "Histogram bucket size: " div; + last = gptimes[1] - 10; + count = 0; + for (i = 1; i <= newNR; i++) { + current = div * int(gptimes[i] / div); + if (last == current) { + count++; + } else { + if (count > 0) + print last, count; + count = 1; + last = current; + } + } + if (count > 0) + print last, count; + print "Distribution of grace periods across tasks:"; + for (i in gptaskcnt) { + print "\t" i, gptaskcnt[i]; + nbatches += gptaskcnt[i]; + } + ngps = nbatches; + print "Distribution of piggybacking across tasks:"; + for (i in piggybackcnt) { + print "\t" i, piggybackcnt[i]; + ngps += piggybackcnt[i]; + } + print "Average grace-period duration: " sum / newNR " microseconds"; + print "Minimum grace-period duration: " gptimes[1]; + print "50th percentile grace-period duration: " gptimes[pct50]; + print "90th percentile grace-period duration: " gptimes[pct90]; + print "99th percentile grace-period duration: " gptimes[pct99]; + print "Maximum grace-period duration: " gptimes[newNR]; + print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches " Lost: " nlost + 0; + print "Computed from ftrace data."; +}' +exit 0 diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh index 1f72df8eedc74..8f3121afc716d 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh @@ -30,8 +30,15 @@ else echo Unreadable results directory: $i exit 1 fi +PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH . tools/testing/selftests/rcutorture/bin/functions.sh +if kvm-recheck-rcuperf-ftrace.sh $i +then + # ftrace data was successfully analyzed, call it good! + exit 0 +fi + configfile=`echo $i | sed -e 's/^.*\///'` sed -e 's/^\[[^]]*]//' < $i/console.log | @@ -85,4 +92,5 @@ END { print "99th percentile grace-period duration: " gptimes[pct99]; print "Maximum grace-period duration: " gptimes[newNR]; print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches; + print "Computed from rcuperf printk output."; }' -- GitLab From df37e66bfdbb57e8cae7dbf39a0c66b1b8701338 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sat, 30 Jan 2016 20:56:38 -0800 Subject: [PATCH 106/705] rcutorture: Add rcuperf holdoff boot parameter to reduce interference Boot-time activity can legitimately grab CPUs for extended time periods, so the commit adds a boot parameter to delay the start of the performance test until boot has completed. Defaults to 10 seconds. Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 6 ++++++ kernel/rcu/rcuperf.c | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 951af481da5a6..da9ee466789b9 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3288,6 +3288,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Measure performance of expedited synchronous grace-period primitives. + rcuperf.holdoff= [KNL] + Set test-start holdoff period. The purpose of + this parameter is to delay the start of the + test until boot completes in order to avoid + interference. + rcuperf.nreaders= [KNL] Set number of RCU readers. The value -1 selects N, where N is the number of CPUs. A value diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 12561f96f0a29..278600143bb6d 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -59,6 +59,7 @@ MODULE_AUTHOR("Paul E. McKenney "); do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); +torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); torture_param(bool, shutdown, false, "Shutdown at end of performance tests."); @@ -368,6 +369,10 @@ rcu_perf_writer(void *arg) set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); sp.sched_priority = 1; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + + if (holdoff) + schedule_timeout_uninterruptible(holdoff * HZ); + t = ktime_get_mono_fast_ns(); if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { t_rcu_perf_writer_started = t; -- GitLab From 620316e52a923811fe9a77ceb43eebf5f507d375 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 30 Jan 2016 21:32:09 -0800 Subject: [PATCH 107/705] rcutorture: Avoid RCU CPU stall warning and RT throttling Running rcuperf can result in RCU CPU stall warnings and RT throttling. These occur because on of the real-time writer processes does ftrace_dump() while still running at real-time priority. This commit therefore prevents these problems by setting the writer thread back to SCHED_NORMAL (AKA SCHED_OTHER) before doing ftrace_dump(). In addition, this commit adds a small fixed delay before dumping ftrace buffer in order to decrease the probability that this dumping will interfere with other writers' grace periods. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcuperf.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 278600143bb6d..4c0572859ff0f 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -404,11 +404,15 @@ rcu_perf_writer(void *arg) started = true; if (!done && i >= MIN_MEAS) { done = true; + sp.sched_priority = 0; + sched_setscheduler_nocheck(current, + SCHED_NORMAL, &sp); pr_alert("%s" PERF_FLAG "rcu_perf_writer %ld has %d measurements\n", perf_type, me, MIN_MEAS); if (atomic_inc_return(&n_rcu_perf_writer_finished) >= nrealwriters) { + schedule_timeout_interruptible(10); rcu_ftrace_dump(DUMP_ALL); PERFOUT_STRING("Test complete"); t_rcu_perf_writer_finished = t; -- GitLab From dba6f1bab8920a6f78b0dc21976afdecf82fba3f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Feb 2016 16:39:38 -0800 Subject: [PATCH 108/705] rcutorture: Add largish-system rcuperf scenario This commit adds an rcuperf scenario named TREE54 that uses 54 CPUs and provides a four-level rcu_node combining tree. Signed-off-by: Paul E. McKenney --- .../rcutorture/configs/rcuperf/TREE54 | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 new file mode 100644 index 0000000000000..985fb170d13c1 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 @@ -0,0 +1,23 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=54 +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +#CHECK#CONFIG_PREEMPT_RCU=y +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_RCU_TRACE=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_FANOUT=3 +CONFIG_RCU_FANOUT_LEAF=2 +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_RCU_TRACE=y -- GitLab From e6fb1fc1085e5b5155bc8f3d3385c48b8bdde95e Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Sun, 7 Feb 2016 13:31:39 +0100 Subject: [PATCH 109/705] rcuperf: Do not wake up shutdown wait queue if "shutdown" is false. After finishing its tests rcuperf tries to wake up shutdown_wq even if the "shutdown" param is set to false, resulting in a wake_up() call on an uninitialized wait_queue_head_t which leads to "BUG: spinlock bad magic" and "BUG: unable to handle kernel NULL pointer dereference". Fix by checking the "shutdown" param before waking up the queue. Signed-off-by: Artem Savkov --- kernel/rcu/rcuperf.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 4c0572859ff0f..3cee0d8393ed8 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -423,8 +423,10 @@ rcu_perf_writer(void *arg) b_rcu_perf_writer_finished = cur_ops->completed(); } - smp_mb(); /* Assign before wake. */ - wake_up(&shutdown_wq); + if (shutdown) { + smp_mb(); /* Assign before wake. */ + wake_up(&shutdown_wq); + } } } if (done && !alldone && -- GitLab From 67522beecfc75d133514dda64107ee19125a74b9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Mar 2016 08:52:19 -0800 Subject: [PATCH 110/705] rcutorture: Remove redundant initialization to zero The current code initializes the global per-CPU variables rcu_torture_count and rcu_torture_batch to zero.
However, C does this initialization by default, and explicit initialization of per-CPU variables now needs a different syntax if "make tags" is to work. This commit therefore removes the initialization. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9234e75b106ae..52b49fe90919b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -130,8 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current; static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count); +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch); static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; -- GitLab From de26ca19a530d2d822a6816834d22022e94b2e53 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 17 Mar 2016 11:14:35 +0100 Subject: [PATCH 111/705] rcutorture: Consider FROZEN hotplug notifier transitions The hotplug notifier rcutorture_cpu_notify() doesn't consider the corresponding CPU_XXX_FROZEN transitions. They occur on suspend/resume and are usually handled the same way as the corresponding non-frozen transitions. Mask the switch case action argument with '~CPU_TASKS_FROZEN' to map CPU_XXX_FROZEN hotplug transitions onto the corresponding non-frozen transitions. Cc: Josh Triplett Cc: "Paul E. McKenney" Signed-off-by: Anna-Maria Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 52b49fe90919b..633a68a094402 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1585,7 +1585,7 @@ static int rcutorture_cpu_notify(struct notifier_block *self, { long cpu = (long)hcpu; - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: case CPU_DOWN_FAILED: (void)rcutorture_booster_init(cpu); -- GitLab From 9eb5188a0704bd21eb7e4aef83b904fad43d3ec8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Mar 2016 15:36:40 -0700 Subject: [PATCH 112/705] torture: Clarify refusal to run more than one torture test This commit clarifies error messages -- you only get to run one torture test at a time! Signed-off-by: Paul E. McKenney --- kernel/torture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/torture.c b/kernel/torture.c index 44aa462d033f7..e912ccd960f0c 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -602,8 +602,9 @@ bool torture_init_begin(char *ttype, bool v, int *runnable) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { - pr_alert("torture_init_begin: refusing %s init: %s running", + pr_alert("torture_init_begin: Refusing %s init: %s running.\n", ttype, torture_type); + pr_alert("torture_init_begin: One torture test at a time!\n"); mutex_unlock(&fullstop_mutex); return false; } -- GitLab From fb2c66af10f92bc83659c4d8a32e02287f0e5dda Mon Sep 17 00:00:00 2001 From: "Paul E.
McKenney" Date: Mon, 28 Mar 2016 14:44:42 -0700 Subject: [PATCH 113/705] torture: Kill qemu, not parent process The current hang-check machinery in the rcutorture scripts uses "$!" of a parenthesized bash statement to capture the pid. Unfortunately, this captures not qemu's pid, but rather that of its parent that implements the parenthesized statement. This commit therefore adjusts things so as to capture qemu's actual pid, which then allows the script to actually kill qemu in event of a kernel hang. Signed-off-by: Paul E. McKenney --- .../rcutorture/bin/kvm-test-1-run.sh | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 0f80eefb0bfd5..2eb8fefbe7d9e 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -168,14 +168,25 @@ then fi echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log echo $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd -( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"; echo $? > $resdir/qemu-retval ) & -qemu_pid=$! +( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"& echo $! > $resdir/qemu_pid; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & commandcompleted=0 -echo Monitoring qemu job at pid $qemu_pid +sleep 10 # Give qemu's pid a chance to reach the file +if test -s "$resdir/qemu_pid" +then + qemu_pid=`cat "$resdir/qemu_pid"` + echo Monitoring qemu job at pid $qemu_pid +else + qemu_pid="" + echo Monitoring qemu job at yet-as-unknown pid +fi while : do + if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" + then + qemu_pid=`cat "$resdir/qemu_pid"` + fi kruntime=`awk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` - if kill -0 $qemu_pid > /dev/null 2>&1 + if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1 then if test $kruntime -ge $seconds then @@ -195,12 +206,16 @@ do ps -fp $killpid >> $resdir/Warnings 2>&1 fi else - echo ' ---' `date`: Kernel done + echo ' ---' `date`: "Kernel done" fi break fi done -if test $commandcompleted -eq 0 +if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" +then + qemu_pid=`cat "$resdir/qemu_pid"` +fi +if test $commandcompleted -eq 0 -a -n "$qemu_pid" then echo Grace period for qemu job at pid $qemu_pid while : @@ -220,6 +235,9 @@ then fi sleep 1 done +elif test -z "$qemu_pid" +then + echo Unknown PID, cannot kill qemu command fi parse-torture.sh $resdir/console.log $title -- GitLab From 480b1eb659f65be8ed039f1a9db3f762c41c9770 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Mar 2016 10:50:38 -0700 Subject: [PATCH 114/705] rcutorture: Convert test duration to seconds early This commit converts test duration from minutes to seconds early on in order to prepare for upcoming OS-jitter-injection changes. Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 5 ++--- tools/testing/selftests/rcutorture/bin/kvm.sh | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 2eb8fefbe7d9e..73a2656684212 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -6,7 +6,7 @@ # Execute this in the source tree. Do not run it as a background task # because qemu does not seem to like that much. # -# Usage: kvm-test-1-run.sh config builddir resdir minutes qemu-args boot_args +# Usage: kvm-test-1-run.sh config builddir resdir seconds qemu-args boot_args # # qemu-args defaults to "-enable-kvm -soundhw pcspk -nographic", along with # arguments specifying the number of CPUs and other @@ -123,8 +123,7 @@ while test -f $builddir.ready do sleep 1 done -minutes=$4 -seconds=$(($minutes * 60)) +seconds=$4 qemu_args=$5 boot_args=$6 diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index c33cb582b3dcb..704e219f67a7a 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -34,7 +34,7 @@ T=/tmp/kvm.sh.$$ trap 'rm -rf $T' 0 mkdir $T -dur=30 +dur=$((30*60)) dryrun="" KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM PATH=${KVM}/bin:$PATH; export PATH @@ -116,7 +116,7 @@ do ;; --duration) checkarg --duration "(minutes)" $# "$2" '^[0-9]*$' '^error' - dur=$2 + dur=$(($2*60)) shift ;; --interactive) -- GitLab From 2ed94f6fde066fb37bc3553b786edb805561699e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 31 Mar 2016 10:18:09 -0700 Subject: [PATCH 115/705] regmap: mmio: Explicitly say little endian is the default in the bus config Otherwise the DT parsing will default to big endian if nothing is specified. Reported-by: Krzysztof Kozlowski Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-mmio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/base/regmap/regmap-mmio.c b/drivers/base/regmap/regmap-mmio.c index 7526906ca080f..1aa26aa31f6ad 100644 --- a/drivers/base/regmap/regmap-mmio.c +++ b/drivers/base/regmap/regmap-mmio.c @@ -212,6 +212,7 @@ static const struct regmap_bus regmap_mmio = { .reg_write = regmap_mmio_write, .reg_read = regmap_mmio_read, .free_context = regmap_mmio_free_context, + .val_format_endian_default = REGMAP_ENDIAN_LITTLE, }; static struct regmap_mmio_context *regmap_mmio_gen_context(struct device *dev, -- GitLab From d18d12d0ff07c47fb913f297c174f30a3f96042d Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Thu, 31 Mar 2016 15:51:32 +0200 Subject: [PATCH 116/705] lib/proportions: Remove unused code By accident I stumbled across code that is no longer used. According to git grep, the global functions in lib/proportions.c are not used anywhere. This patch removes the old, unused code. Peter Zijlstra further commented: "Ah indeed, that got replaced with the flex proportion code a while back."
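For reference, the formula the removed code implemented is spelled out in proportions.c's own header comment, quoted in the diff below. Transcribed into conventional notation (no new mathematics, just that comment's ASCII rendering cleaned up), the proportion tracked for an element j is

\[ p_j \;=\; \sum_{i=0}^{\infty} \frac{1}{2^{\,i+1}} \, \frac{dx_j}{dt_{-i}}, \qquad \sum_{i=0}^{\infty} \frac{1}{2^{\,i+1}} = 1, \]

so that, with total event count t = \sum_j x_j, the per-element proportions satisfy \sum_j p_j = 1.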
Signed-off-by: Richard Cochran Acked-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/4265b49bed713fbe3faaf8c05da0e1792f09c0b3.1459432020.git.rcochran@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/proportions.h | 137 ------------ include/linux/sched.h | 1 - lib/Makefile | 2 +- lib/proportions.c | 407 ------------------------------------ 4 files changed, 1 insertion(+), 546 deletions(-) delete mode 100644 include/linux/proportions.h delete mode 100644 lib/proportions.c diff --git a/include/linux/proportions.h b/include/linux/proportions.h deleted file mode 100644 index 21221338ad180..0000000000000 --- a/include/linux/proportions.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * FLoating proportions - * - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra - * - * This file contains the public data structure and API definitions. - */ - -#ifndef _LINUX_PROPORTIONS_H -#define _LINUX_PROPORTIONS_H - -#include -#include -#include -#include - -struct prop_global { - /* - * The period over which we differentiate - * - * period = 2^shift - */ - int shift; - /* - * The total event counter aka 'time'. - * - * Treated as an unsigned long; the lower 'shift - 1' bits are the - * counter bits, the remaining upper bits the period counter. - */ - struct percpu_counter events; -}; - -/* - * global proportion descriptor - * - * this is needed to consistently flip prop_global structures. - */ -struct prop_descriptor { - int index; - struct prop_global pg[2]; - struct mutex mutex; /* serialize the prop_global switch */ -}; - -int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp); -void prop_change_shift(struct prop_descriptor *pd, int new_shift); - -/* - * ----- PERCPU ------ - */ - -struct prop_local_percpu { - /* - * the local events counter - */ - struct percpu_counter events; - - /* - * snapshot of the last seen global state - */ - int shift; - unsigned long period; - raw_spinlock_t lock; /* protect the snapshot state */ -}; - -int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp); -void prop_local_destroy_percpu(struct prop_local_percpu *pl); -void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl); -void prop_fraction_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl, - long *numerator, long *denominator); - -static inline -void prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl) -{ - unsigned long flags; - - local_irq_save(flags); - __prop_inc_percpu(pd, pl); - local_irq_restore(flags); -} - -/* - * Limit the time part in order to ensure there are some bits left for the - * cycle counter and fraction multiply. 
- */ -#if BITS_PER_LONG == 32 -#define PROP_MAX_SHIFT (3*BITS_PER_LONG/4) -#else -#define PROP_MAX_SHIFT (BITS_PER_LONG/2) -#endif - -#define PROP_FRAC_SHIFT (BITS_PER_LONG - PROP_MAX_SHIFT - 1) -#define PROP_FRAC_BASE (1UL << PROP_FRAC_SHIFT) - -void __prop_inc_percpu_max(struct prop_descriptor *pd, - struct prop_local_percpu *pl, long frac); - - -/* - * ----- SINGLE ------ - */ - -struct prop_local_single { - /* - * the local events counter - */ - unsigned long events; - - /* - * snapshot of the last seen global state - * and a lock protecting this state - */ - unsigned long period; - int shift; - raw_spinlock_t lock; /* protect the snapshot state */ -}; - -#define INIT_PROP_LOCAL_SINGLE(name) \ -{ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ -} - -int prop_local_init_single(struct prop_local_single *pl); -void prop_local_destroy_single(struct prop_local_single *pl); -void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl); -void prop_fraction_single(struct prop_descriptor *pd, struct prop_local_single *pl, - long *numerator, long *denominator); - -static inline -void prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl) -{ - unsigned long flags; - - local_irq_save(flags); - __prop_inc_single(pd, pl); - local_irq_restore(flags); -} - -#endif /* _LINUX_PROPORTIONS_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 60bba7e032dc3..6dd25d1869b58 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -40,7 +40,6 @@ struct sched_param { #include #include #include -#include #include #include #include diff --git a/lib/Makefile b/lib/Makefile index 7bd6fd436c97a..a65e9a8615355 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -23,7 +23,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o \ sha1.o md5.o irq_regs.o argv_split.o \ - proportions.o flex_proportions.o ratelimit.o show_mem.o \ + flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o nmi_backtrace.o diff --git a/lib/proportions.c b/lib/proportions.c deleted file mode 100644 index efa54f259ea9d..0000000000000 --- a/lib/proportions.c +++ /dev/null @@ -1,407 +0,0 @@ -/* - * Floating proportions - * - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra - * - * Description: - * - * The floating proportion is a time derivative with an exponentially decaying - * history: - * - * p_{j} = \Sum_{i=0} (dx_{j}/dt_{-i}) / 2^(1+i) - * - * Where j is an element from {prop_local}, x_{j} is j's number of events, - * and i the time period over which the differential is taken. So d/dt_{-i} is - * the differential over the i-th last period. - * - * The decaying history gives smooth transitions. The time differential carries - * the notion of speed. - * - * The denominator is 2^(1+i) because we want the series to be normalised, ie. 
- * - * \Sum_{i=0} 1/2^(1+i) = 1 - * - * Further more, if we measure time (t) in the same events as x; so that: - * - * t = \Sum_{j} x_{j} - * - * we get that: - * - * \Sum_{j} p_{j} = 1 - * - * Writing this in an iterative fashion we get (dropping the 'd's): - * - * if (++x_{j}, ++t > period) - * t /= 2; - * for_each (j) - * x_{j} /= 2; - * - * so that: - * - * p_{j} = x_{j} / t; - * - * We optimize away the '/= 2' for the global time delta by noting that: - * - * if (++t > period) t /= 2: - * - * Can be approximated by: - * - * period/2 + (++t % period/2) - * - * [ Furthermore, when we choose period to be 2^n it can be written in terms of - * binary operations and wraparound artefacts disappear. ] - * - * Also note that this yields a natural counter of the elapsed periods: - * - * c = t / (period/2) - * - * [ Its monotonic increasing property can be applied to mitigate the wrap- - * around issue. ] - * - * This allows us to do away with the loop over all prop_locals on each period - * expiration. By remembering the period count under which it was last accessed - * as c_{j}, we can obtain the number of 'missed' cycles from: - * - * c - c_{j} - * - * We can then lazily catch up to the global period count every time we are - * going to use x_{j}, by doing: - * - * x_{j} /= 2^(c - c_{j}), c_{j} = c - */ - -#include -#include - -int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp) -{ - int err; - - if (shift > PROP_MAX_SHIFT) - shift = PROP_MAX_SHIFT; - - pd->index = 0; - pd->pg[0].shift = shift; - mutex_init(&pd->mutex); - err = percpu_counter_init(&pd->pg[0].events, 0, gfp); - if (err) - goto out; - - err = percpu_counter_init(&pd->pg[1].events, 0, gfp); - if (err) - percpu_counter_destroy(&pd->pg[0].events); - -out: - return err; -} - -/* - * We have two copies, and flip between them to make it seem like an atomic - * update. The update is not really atomic wrt the events counter, but - * it is internally consistent with the bit layout depending on shift. - * - * We copy the events count, move the bits around and flip the index. - */ -void prop_change_shift(struct prop_descriptor *pd, int shift) -{ - int index; - int offset; - u64 events; - unsigned long flags; - - if (shift > PROP_MAX_SHIFT) - shift = PROP_MAX_SHIFT; - - mutex_lock(&pd->mutex); - - index = pd->index ^ 1; - offset = pd->pg[pd->index].shift - shift; - if (!offset) - goto out; - - pd->pg[index].shift = shift; - - local_irq_save(flags); - events = percpu_counter_sum(&pd->pg[pd->index].events); - if (offset < 0) - events <<= -offset; - else - events >>= offset; - percpu_counter_set(&pd->pg[index].events, events); - - /* - * ensure the new pg is fully written before the switch - */ - smp_wmb(); - pd->index = index; - local_irq_restore(flags); - - synchronize_rcu(); - -out: - mutex_unlock(&pd->mutex); -} - -/* - * wrap the access to the data in an rcu_read_lock() section; - * this is used to track the active references. 
- */ -static struct prop_global *prop_get_global(struct prop_descriptor *pd) -__acquires(RCU) -{ - int index; - - rcu_read_lock(); - index = pd->index; - /* - * match the wmb from vcd_flip() - */ - smp_rmb(); - return &pd->pg[index]; -} - -static void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg) -__releases(RCU) -{ - rcu_read_unlock(); -} - -static void -prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift) -{ - int offset = *pl_shift - new_shift; - - if (!offset) - return; - - if (offset < 0) - *pl_period <<= -offset; - else - *pl_period >>= offset; - - *pl_shift = new_shift; -} - -/* - * PERCPU - */ - -#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) - -int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp) -{ - raw_spin_lock_init(&pl->lock); - pl->shift = 0; - pl->period = 0; - return percpu_counter_init(&pl->events, 0, gfp); -} - -void prop_local_destroy_percpu(struct prop_local_percpu *pl) -{ - percpu_counter_destroy(&pl->events); -} - -/* - * Catch up with missed period expirations. - * - * until (c_{j} == c) - * x_{j} -= x_{j}/2; - * c_{j}++; - */ -static -void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl) -{ - unsigned long period = 1UL << (pg->shift - 1); - unsigned long period_mask = ~(period - 1); - unsigned long global_period; - unsigned long flags; - - global_period = percpu_counter_read(&pg->events); - global_period &= period_mask; - - /* - * Fast path - check if the local and global period count still match - * outside of the lock. - */ - if (pl->period == global_period) - return; - - raw_spin_lock_irqsave(&pl->lock, flags); - prop_adjust_shift(&pl->shift, &pl->period, pg->shift); - - /* - * For each missed period, we half the local counter. - * basically: - * pl->events >> (global_period - pl->period); - */ - period = (global_period - pl->period) >> (pg->shift - 1); - if (period < BITS_PER_LONG) { - s64 val = percpu_counter_read(&pl->events); - - if (val < (nr_cpu_ids * PROP_BATCH)) - val = percpu_counter_sum(&pl->events); - - __percpu_counter_add(&pl->events, -val + (val >> period), - PROP_BATCH); - } else - percpu_counter_set(&pl->events, 0); - - pl->period = global_period; - raw_spin_unlock_irqrestore(&pl->lock, flags); -} - -/* - * ++x_{j}, ++t - */ -void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl) -{ - struct prop_global *pg = prop_get_global(pd); - - prop_norm_percpu(pg, pl); - __percpu_counter_add(&pl->events, 1, PROP_BATCH); - percpu_counter_add(&pg->events, 1); - prop_put_global(pd, pg); -} - -/* - * identical to __prop_inc_percpu, except that it limits this pl's fraction to - * @frac/PROP_FRAC_BASE by ignoring events when this limit has been exceeded. 
- */ -void __prop_inc_percpu_max(struct prop_descriptor *pd, - struct prop_local_percpu *pl, long frac) -{ - struct prop_global *pg = prop_get_global(pd); - - prop_norm_percpu(pg, pl); - - if (unlikely(frac != PROP_FRAC_BASE)) { - unsigned long period_2 = 1UL << (pg->shift - 1); - unsigned long counter_mask = period_2 - 1; - unsigned long global_count; - long numerator, denominator; - - numerator = percpu_counter_read_positive(&pl->events); - global_count = percpu_counter_read(&pg->events); - denominator = period_2 + (global_count & counter_mask); - - if (numerator > ((denominator * frac) >> PROP_FRAC_SHIFT)) - goto out_put; - } - - percpu_counter_add(&pl->events, 1); - percpu_counter_add(&pg->events, 1); - -out_put: - prop_put_global(pd, pg); -} - -/* - * Obtain a fraction of this proportion - * - * p_{j} = x_{j} / (period/2 + t % period/2) - */ -void prop_fraction_percpu(struct prop_descriptor *pd, - struct prop_local_percpu *pl, - long *numerator, long *denominator) -{ - struct prop_global *pg = prop_get_global(pd); - unsigned long period_2 = 1UL << (pg->shift - 1); - unsigned long counter_mask = period_2 - 1; - unsigned long global_count; - - prop_norm_percpu(pg, pl); - *numerator = percpu_counter_read_positive(&pl->events); - - global_count = percpu_counter_read(&pg->events); - *denominator = period_2 + (global_count & counter_mask); - - prop_put_global(pd, pg); -} - -/* - * SINGLE - */ - -int prop_local_init_single(struct prop_local_single *pl) -{ - raw_spin_lock_init(&pl->lock); - pl->shift = 0; - pl->period = 0; - pl->events = 0; - return 0; -} - -void prop_local_destroy_single(struct prop_local_single *pl) -{ -} - -/* - * Catch up with missed period expirations. - */ -static -void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl) -{ - unsigned long period = 1UL << (pg->shift - 1); - unsigned long period_mask = ~(period - 1); - unsigned long global_period; - unsigned long flags; - - global_period = percpu_counter_read(&pg->events); - global_period &= period_mask; - - /* - * Fast path - check if the local and global period count still match - * outside of the lock. - */ - if (pl->period == global_period) - return; - - raw_spin_lock_irqsave(&pl->lock, flags); - prop_adjust_shift(&pl->shift, &pl->period, pg->shift); - /* - * For each missed period, we half the local counter. 
- */ - period = (global_period - pl->period) >> (pg->shift - 1); - if (likely(period < BITS_PER_LONG)) - pl->events >>= period; - else - pl->events = 0; - pl->period = global_period; - raw_spin_unlock_irqrestore(&pl->lock, flags); -} - -/* - * ++x_{j}, ++t - */ -void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl) -{ - struct prop_global *pg = prop_get_global(pd); - - prop_norm_single(pg, pl); - pl->events++; - percpu_counter_add(&pg->events, 1); - prop_put_global(pd, pg); -} - -/* - * Obtain a fraction of this proportion - * - * p_{j} = x_{j} / (period/2 + t % period/2) - */ -void prop_fraction_single(struct prop_descriptor *pd, - struct prop_local_single *pl, - long *numerator, long *denominator) -{ - struct prop_global *pg = prop_get_global(pd); - unsigned long period_2 = 1UL << (pg->shift - 1); - unsigned long counter_mask = period_2 - 1; - unsigned long global_count; - - prop_norm_single(pg, pl); - *numerator = pl->events; - - global_count = percpu_counter_read(&pg->events); - *denominator = period_2 + (global_count & counter_mask); - - prop_put_global(pd, pg); -} -- GitLab From d7847a7017b2a2759dd5590c0cffdbdf2994918e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 1 Apr 2016 09:00:35 +0200 Subject: [PATCH 117/705] x86/cpufeature: Fix build bug caused by merge artifact with the removal of cpu_has_hypervisor The 0-day build robot by Fengguang Wu reported a build failure: arch/x86/events//intel/cstate.c: In function 'cstate_pmu_init': arch/x86/events//intel/cstate.c:680:6: error: 'cpu_has_hypervisor' undeclared (first use in this function) ... which was caused by a merge mistake I made when applying the following patch: 0c9f3536cc71 ("x86/cpufeature: Remove cpu_has_hypervisor") apply the missing hunk as well. Reported-by: kbuild test robot Cc: David Kershner Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: sparmaintainer@unisys.com Cc: virtualization@lists.linux-foundation.org Link: http://lkml.kernel.org/r/1459266123-21878-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 7946c4231169f..d5045c8e2e635 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -677,7 +677,7 @@ static int __init cstate_pmu_init(void) { int err; - if (cpu_has_hypervisor) + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return -ENODEV; err = cstate_init(); -- GitLab From 1c532e00a0c649ac6f0703e8c2e095c9c1d30625 Mon Sep 17 00:00:00 2001 From: Alex Thorlton Date: Thu, 31 Mar 2016 14:18:29 -0500 Subject: [PATCH 118/705] x86/platform/uv: Disable UV BAU by default For several years, the common practice has been to boot UVs with the "nobau" parameter on the command line, to disable the BAU. We've decided that it makes more sense to just disable the BAU by default in the kernel, and provide the option to turn it on, if desired. For now, having the on/off switch doesn't buy us any more than just reversing the logic would, but we're working towards having the BAU enabled by default on UV4. When those changes are in place, having the on/off switch will make more sense than an enable flag, since the default behavior will be different depending on the system version. I've also added a bit of documentation for the new parameter to Documentation/kernel-parameters.txt. 
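[ Editor's note: the actual switch added below lives in tlb_uv.c; for reference, this is the generic shape of a boolean early_param() handler built on strtobool(), the same helper the patch uses. The names here (example_feature, setup_example_feature) are hypothetical, and the real setup_bau() additionally inverts the parsed value into its 'nobau' flag, as the comment in the diff notes: ]

#include <linux/init.h>
#include <linux/string.h>

static bool example_feature;	/* defaults to off, like the BAU after this patch */

/* "example_feature=1"/"y" enables, "example_feature=0"/"n" disables */
static int __init setup_example_feature(char *arg)
{
	bool val;
	int ret;

	if (!arg)
		return -EINVAL;

	ret = strtobool(arg, &val);
	if (ret)
		return ret;

	example_feature = val;
	return 0;
}
early_param("example_feature", setup_example_feature);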
Signed-off-by: Alex Thorlton Reviewed-by: Hedi Berriche Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459451909-121845-1-git-send-email-athorlton@sgi.com Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 8 +++++++ arch/x86/include/asm/uv/uv_bau.h | 2 +- arch/x86/platform/uv/tlb_uv.c | 35 ++++++++++++++++++++--------- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index ecc74fa4bfde8..893a70907f15b 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -131,6 +131,7 @@ parameter is applicable: More X86-64 boot options can be found in Documentation/x86/x86_64/boot-options.txt . X86 Either 32-bit or 64-bit x86 (same as X86-32+X86-64) + X86_UV SGI UV support is enabled. XEN Xen support is enabled In addition, the following text indicates that the option: @@ -542,6 +543,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Format: (must be >=0) Default: 64 + bau= [X86_UV] Enable the BAU on SGI UV. The default + behavior is to disable the BAU (i.e. bau=0). + Format: { "0" | "1" } + 0 - Disable the BAU. + 1 - Enable the BAU. + unset - Disable the BAU. + baycom_epp= [HW,AX25] Format: , diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index fc808b83fccb2..cc44d926c17e3 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -598,7 +598,7 @@ struct bau_control { int timeout_tries; int ipi_attempts; int conseccompletes; - short nobau; + bool nobau; short baudisabled; short cpu; short osnode; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 3b6ec42718e46..534ab944b9477 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -37,7 +37,7 @@ static int timeout_base_ns[] = { }; static int timeout_us; -static int nobau; +static bool nobau = true; static int nobau_perm; static cycles_t congested_cycles; @@ -106,13 +106,28 @@ static char *stat_description[] = { "enable: number times use of the BAU was re-enabled" }; -static int __init -setup_nobau(char *arg) +static int __init setup_bau(char *arg) { - nobau = 1; + int result; + + if (!arg) + return -EINVAL; + + result = strtobool(arg, &nobau); + if (result) + return result; + + /* we need to flip the logic here, so that bau=y sets nobau to false */ + nobau = !nobau; + + if (!nobau) + pr_info("UV BAU Enabled\n"); + else + pr_info("UV BAU Disabled\n"); + return 0; } -early_param("nobau", setup_nobau); +early_param("bau", setup_bau); /* base pnode in this partition */ static int uv_base_pnode __read_mostly; @@ -131,10 +146,10 @@ set_bau_on(void) pr_info("BAU not initialized; cannot be turned on\n"); return; } - nobau = 0; + nobau = false; for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); - bcp->nobau = 0; + bcp->nobau = false; } pr_info("BAU turned on\n"); return; @@ -146,10 +161,10 @@ set_bau_off(void) int cpu; struct bau_control *bcp; - nobau = 1; + nobau = true; for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); - bcp->nobau = 1; + bcp->nobau = true; } pr_info("BAU turned off\n"); return; @@ -1886,7 +1901,7 @@ static void __init init_per_cpu_tunables(void) bcp = &per_cpu(bau_control, cpu); bcp->baudisabled = 0; if (nobau) - bcp->nobau = 1; + bcp->nobau = true; bcp->statp = &per_cpu(ptcstats, cpu); /* time interval to catch a hardware stay-busy bug */ bcp->timeout_interval = 
usec_2_cycles(2*timeout_us); -- GitLab From 2a28e23049af99e1c810111ef5e56455cafeda45 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 8 Mar 2016 10:38:50 +0200 Subject: [PATCH 119/705] perf jit: Add support for using TSC as a timestamp Intel PT uses TSC as a timestamp, so add support for using TSC instead of the monotonic clock. Use of TSC is selected by an environment variable "JITDUMP_USE_ARCH_TIMESTAMP" and flagged in the jitdump file with flag JITDUMP_FLAGS_ARCH_TIMESTAMP. Signed-off-by: Adrian Hunter Cc: Alexander Shishkin Cc: He Kuang Cc: Jiri Olsa Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Sukadev Bhattiprolu Cc: Wang Nan Link: http://lkml.kernel.org/r/1457426330-30226-1-git-send-email-adrian.hunter@intel.com [ Added the fixup from He Kuang to make it build on other arches, ] [ such as aarch64, to avoid inserting this bisection breakage upstream ] Link: http://lkml.kernel.org/r/1459482572-129494-1-git-send-email-hekuang@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/tsc.c | 1 - tools/perf/arch/x86/util/tsc.h | 17 -------------- tools/perf/jvmti/jvmti_agent.c | 43 ++++++++++++++++++++++++++++++++-- tools/perf/util/Build | 3 +-- tools/perf/util/jitdump.c | 37 +++++++++++++++++++++++++---- tools/perf/util/jitdump.h | 3 +++ tools/perf/util/tsc.h | 11 ++++++++- 7 files changed, 87 insertions(+), 28 deletions(-) delete mode 100644 tools/perf/arch/x86/util/tsc.h diff --git a/tools/perf/arch/x86/util/tsc.c b/tools/perf/arch/x86/util/tsc.c index 70ff7c14bea6a..357f1b13b5ae3 100644 --- a/tools/perf/arch/x86/util/tsc.c +++ b/tools/perf/arch/x86/util/tsc.c @@ -7,7 +7,6 @@ #include #include "../../util/debug.h" #include "../../util/tsc.h" -#include "tsc.h" int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, struct perf_tsc_conversion *tc) diff --git a/tools/perf/arch/x86/util/tsc.h b/tools/perf/arch/x86/util/tsc.h deleted file mode 100644 index 2edc4d31065c1..0000000000000 --- a/tools/perf/arch/x86/util/tsc.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef TOOLS_PERF_ARCH_X86_UTIL_TSC_H__ -#define TOOLS_PERF_ARCH_X86_UTIL_TSC_H__ - -#include - -struct perf_tsc_conversion { - u16 time_shift; - u32 time_mult; - u64 time_zero; -}; - -struct perf_event_mmap_page; - -int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, - struct perf_tsc_conversion *tc); - -#endif /* TOOLS_PERF_ARCH_X86_UTIL_TSC_H__ */ diff --git a/tools/perf/jvmti/jvmti_agent.c b/tools/perf/jvmti/jvmti_agent.c index 6461e02ab940d..3573f315f9559 100644 --- a/tools/perf/jvmti/jvmti_agent.c +++ b/tools/perf/jvmti/jvmti_agent.c @@ -92,6 +92,22 @@ static int get_e_machine(struct jitheader *hdr) return ret; } +static int use_arch_timestamp; + +static inline uint64_t +get_arch_timestamp(void) +{ +#if defined(__i386__) || defined(__x86_64__) + unsigned int low, high; + + asm volatile("rdtsc" : "=a" (low), "=d" (high)); + + return low | ((uint64_t)high) << 32; +#else + return 0; +#endif +} + #define NSEC_PER_SEC 1000000000 static int perf_clk_id = CLOCK_MONOTONIC; @@ -107,6 +123,9 @@ perf_get_timestamp(void) struct timespec ts; int ret; + if (use_arch_timestamp) + return get_arch_timestamp(); + ret = clock_gettime(perf_clk_id, &ts); if (ret) return 0; @@ -203,6 +222,17 @@ perf_close_marker_file(void) munmap(marker_addr, pgsz); } +static void +init_arch_timestamp(void) +{ + char *str = getenv("JITDUMP_USE_ARCH_TIMESTAMP"); + + if (!str || !*str || !strcmp(str, "0")) + return; + + use_arch_timestamp = 1; +} + void *jvmti_open(void) { int pad_cnt; @@ -211,11
+241,17 @@ void *jvmti_open(void) int fd; FILE *fp; + init_arch_timestamp(); + /* * check if clockid is supported */ - if (!perf_get_timestamp()) - warnx("jvmti: kernel does not support %d clock id", perf_clk_id); + if (!perf_get_timestamp()) { + if (use_arch_timestamp) + warnx("jvmti: arch timestamp not supported"); + else + warnx("jvmti: kernel does not support %d clock id", perf_clk_id); + } memset(&header, 0, sizeof(header)); @@ -263,6 +299,9 @@ void *jvmti_open(void) header.timestamp = perf_get_timestamp(); + if (use_arch_timestamp) + header.flags |= JITDUMP_FLAGS_ARCH_TIMESTAMP; + if (!fwrite(&header, sizeof(header), 1, fp)) { warn("jvmti: cannot write dumpfile header"); goto error; diff --git a/tools/perf/util/Build b/tools/perf/util/Build index da48fd843438f..85ceff357769b 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -69,8 +69,7 @@ libperf-y += stat-shadow.o libperf-y += record.o libperf-y += srcline.o libperf-y += data.o -libperf-$(CONFIG_X86) += tsc.o -libperf-$(CONFIG_AUXTRACE) += tsc.o +libperf-y += tsc.o libperf-y += cloexec.o libperf-y += thread-stack.o libperf-$(CONFIG_AUXTRACE) += auxtrace.o diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index ad0c0bb1fbc78..52fcef3074fee 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -17,6 +17,7 @@ #include "strlist.h" #include +#include "tsc.h" #include "session.h" #include "jit.h" #include "jitdump.h" @@ -33,6 +34,7 @@ struct jit_buf_desc { size_t bufsize; FILE *in; bool needs_bswap; /* handles cross-endianess */ + bool use_arch_timestamp; void *debug_data; size_t nr_debug_entries; uint32_t code_load_count; @@ -158,13 +160,16 @@ jit_open(struct jit_buf_desc *jd, const char *name) header.flags = bswap_64(header.flags); } + jd->use_arch_timestamp = header.flags & JITDUMP_FLAGS_ARCH_TIMESTAMP; + if (verbose > 2) - pr_debug("version=%u\nhdr.size=%u\nts=0x%llx\npid=%d\nelf_mach=%d\n", + pr_debug("version=%u\nhdr.size=%u\nts=0x%llx\npid=%d\nelf_mach=%d\nuse_arch_timestamp=%d\n", header.version, header.total_size, (unsigned long long)header.timestamp, header.pid, - header.elf_mach); + header.elf_mach, + jd->use_arch_timestamp); if (header.flags & JITDUMP_FLAGS_RESERVED) { pr_err("jitdump file contains invalid or unsupported flags 0x%llx\n", @@ -172,10 +177,15 @@ jit_open(struct jit_buf_desc *jd, const char *name) goto error; } + if (jd->use_arch_timestamp && !jd->session->time_conv.time_mult) { + pr_err("jitdump file uses arch timestamps but there is no timestamp conversion\n"); + goto error; + } + /* * validate event is using the correct clockid */ - if (jit_validate_events(jd->session)) { + if (!jd->use_arch_timestamp && jit_validate_events(jd->session)) { pr_err("error, jitted code must be sampled with perf record -k 1\n"); goto error; } @@ -329,6 +339,23 @@ jit_inject_event(struct jit_buf_desc *jd, union perf_event *event) return 0; } +static uint64_t convert_timestamp(struct jit_buf_desc *jd, uint64_t timestamp) +{ + struct perf_tsc_conversion tc; + + if (!jd->use_arch_timestamp) + return timestamp; + + tc.time_shift = jd->session->time_conv.time_shift; + tc.time_mult = jd->session->time_conv.time_mult; + tc.time_zero = jd->session->time_conv.time_zero; + + if (!tc.time_mult) + return 0; + + return tsc_to_perf_time(timestamp, &tc); +} + static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr) { struct perf_sample sample; @@ -410,7 +437,7 @@ static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr) id->tid = tid; } if (jd->sample_type 
& PERF_SAMPLE_TIME) - id->time = jr->load.p.timestamp; + id->time = convert_timestamp(jd, jr->load.p.timestamp); /* * create pseudo sample to induce dso hit increment @@ -499,7 +526,7 @@ static int jit_repipe_code_move(struct jit_buf_desc *jd, union jr_entry *jr) id->tid = tid; } if (jd->sample_type & PERF_SAMPLE_TIME) - id->time = jr->load.p.timestamp; + id->time = convert_timestamp(jd, jr->load.p.timestamp); /* * create pseudo sample to induce dso hit increment diff --git a/tools/perf/util/jitdump.h b/tools/perf/util/jitdump.h index b66c1f503d9ed..bcacd20d0c1c7 100644 --- a/tools/perf/util/jitdump.h +++ b/tools/perf/util/jitdump.h @@ -23,9 +23,12 @@ #define JITHEADER_VERSION 1 enum jitdump_flags_bits { + JITDUMP_FLAGS_ARCH_TIMESTAMP_BIT, JITDUMP_FLAGS_MAX_BIT, }; +#define JITDUMP_FLAGS_ARCH_TIMESTAMP (1ULL << JITDUMP_FLAGS_ARCH_TIMESTAMP_BIT) + #define JITDUMP_FLAGS_RESERVED (JITDUMP_FLAGS_MAX_BIT < 64 ? \ (~((1ULL << JITDUMP_FLAGS_MAX_BIT) - 1)) : 0) diff --git a/tools/perf/util/tsc.h b/tools/perf/util/tsc.h index 280ddc067556a..d5b11e2b85e05 100644 --- a/tools/perf/util/tsc.h +++ b/tools/perf/util/tsc.h @@ -4,7 +4,16 @@ #include #include "event.h" -#include "../arch/x86/util/tsc.h" + +struct perf_tsc_conversion { + u16 time_shift; + u32 time_mult; + u64 time_zero; +}; +struct perf_event_mmap_page; + +int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, + struct perf_tsc_conversion *tc); u64 perf_time_to_tsc(u64 ns, struct perf_tsc_conversion *tc); u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc); -- GitLab From bd0c7a54219cc3745ce7f36970d8e5ffb3f8d80e Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 8 Mar 2016 10:38:53 +0200 Subject: [PATCH 120/705] perf intel-pt/bts: Define JITDUMP_USE_ARCH_TIMESTAMP For Intel PT / BTS, define the environment variable that selects TSC timestamps in the jitdump file. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1457426333-30260-1-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/intel-bts.c | 5 +++++ tools/perf/arch/x86/util/intel-pt.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index d66f9ad4df2ea..7dc30637cf66f 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -438,6 +438,11 @@ struct auxtrace_record *intel_bts_recording_init(int *err) if (!intel_bts_pmu) return NULL; + if (setenv("JITDUMP_USE_ARCH_TIMESTAMP", "1", 1)) { + *err = -errno; + return NULL; + } + btsr = zalloc(sizeof(struct intel_bts_recording)); if (!btsr) { *err = -ENOMEM; diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index a3395179c9eeb..a07b9605e93b3 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -1027,6 +1027,11 @@ struct auxtrace_record *intel_pt_recording_init(int *err) if (!intel_pt_pmu) return NULL; + if (setenv("JITDUMP_USE_ARCH_TIMESTAMP", "1", 1)) { + *err = -errno; + return NULL; + } + ptr = zalloc(sizeof(struct intel_pt_recording)); if (!ptr) { *err = -ENOMEM; -- GitLab From ac0e2cd555373ae6f8f3a3ad3fbbf5b6d1e7aaaa Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 30 Mar 2016 12:16:15 -0700 Subject: [PATCH 121/705] perf tools: Fix PMU term format max value calculation Currently the max value of format is calculated by the bits number. It relies on the continuity of the format. 
However, uncore event format is not continuous. E.g. uncore qpi event format can be 0-7,21. If bit 21 is set, there are parsing issues as below. $ perf stat -a -e uncore_qpi_0/event=0x200002,umask=0x8/ event syntax error: '..pi_0/event=0x200002,umask=0x8/' \___ value too big for format, maximum is 511 This patch returns the real max value by setting all possible bits to 1. Signed-off-by: Kan Liang Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1459365375-14285-1-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index adef23b1352e8..bf34468a99cbc 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -602,14 +602,13 @@ static void pmu_format_value(unsigned long *format, __u64 value, __u64 *v, static __u64 pmu_format_max_value(const unsigned long *format) { - int w; + __u64 w = 0; + int fbit; - w = bitmap_weight(format, PERF_PMU_FORMAT_BITS); - if (!w) - return 0; - if (w < 64) - return (1ULL << w) - 1; - return -1; + for_each_set_bit(fbit, format, PERF_PMU_FORMAT_BITS) + w |= (1ULL << fbit); + + return w; } /* -- GitLab From e6001980c61b45ef090e2b4c9c1953ef897cdeb0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 31 Mar 2016 15:16:28 -0300 Subject: [PATCH 122/705] perf trace: Introduce function to set the base timestamp That is used both in live runs, i.e.: # trace ls as well as when processing events recorded in a perf.data file: # trace -i perf.data Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-901l6yebnzeqg7z8mbaf49xb@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index c45c1cfeb866d..99daeed55a9b8 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2400,6 +2400,14 @@ static bool skip_sample(struct trace *trace, struct perf_sample *sample) return false; } +static void trace__set_base_time(struct trace *trace, + struct perf_evsel *evsel __maybe_unused, + struct perf_sample *sample) +{ + if (trace->base_time == 0 && !trace->full_time) + trace->base_time = sample->time; +} + static int trace__process_sample(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, @@ -2414,8 +2422,7 @@ static int trace__process_sample(struct perf_tool *tool, if (skip_sample(trace, sample)) return 0; - if (!trace->full_time && trace->base_time == 0) - trace->base_time = sample->time; + trace__set_base_time(trace, evsel, sample); if (handler) { ++trace->nr_events; @@ -2553,9 +2560,6 @@ static void trace__handle_event(struct trace *trace, union perf_event *event, st const u32 type = event->header.type; struct perf_evsel *evsel; - if (!trace->full_time && trace->base_time == 0) - trace->base_time = sample->time; - if (type != PERF_RECORD_SAMPLE) { trace__process_event(trace, trace->host, event, sample); return; @@ -2567,6 +2571,8 @@ static void trace__handle_event(struct trace *trace, union perf_event *event, st return; } + trace__set_base_time(trace, evsel, sample); + if (evsel->attr.type == PERF_TYPE_TRACEPOINT && sample->raw_data == NULL) { fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n", -- GitLab From
8a07a8094b6b6c3e195885ec31f4bd0be54aafaf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 31 Mar 2016 15:19:39 -0300 Subject: [PATCH 123/705] perf trace: Don't set the base timestamp using events without PERF_SAMPLE_TIME This was causing bogus values to be shown at the timestamp column: Before: # trace --ev bpf-output/no-inherit,name=evt/ --ev /home/acme/bpf/test_bpf_trace.c/map:channel.event=evt/ usleep 10 94631143.385 ( 0.001 ms): brk( ) = 0x555555757000 94631143.398 ( 0.003 ms): mmap(len: 4096, prot: READ|WRITE, flags: PRIVATE|ANONYMOUS, fd: -1) = 0x7ffff7ff6000 94631143.406 ( 0.004 ms): access(filename: 0xf7df9e10, mode: R ) = -1 ENOENT No such file or directory 94631143.412 ( 0.004 ms): open(filename: 0xf7df8761, flags: CLOEXEC) = 3 94631143.415 ( 0.002 ms): fstat(fd: 3, statbuf: 0x7fffffffd6b0 ) = 0 94631143.419 ( 0.003 ms): mmap(len: 106798, prot: READ, flags: PRIVATE, fd: 3) = 0x7ffff7fdb000 94631143.420 ( 0.001 ms): close(fd: 3 ) = 0 94631143.432 ( 0.004 ms): open(filename: 0xf7ff6640, flags: CLOEXEC) = 3 After: # trace --ev bpf-output/no-inherit,name=evt/ --ev /home/acme/bpf/test_bpf_trace.c/map:channel.event=evt/ usleep 10 0.022 ( 0.001 ms): brk( ) = 0x55d7668a6000 0.037 ( 0.003 ms): mmap(len: 4096, prot: READ|WRITE, flags: PRIVATE|ANONYMOUS, fd: -1) = 0x7f8fbeb97000 0.123 ( 0.083 ms): access(filename: 0xbe995e10, mode: R ) = -1 ENOENT No such file or directory 0.130 ( 0.004 ms): open(filename: 0xbe994761, flags: CLOEXEC) = 3 0.133 ( 0.002 ms): fstat(fd: 3, statbuf: 0x7fff6487a890 ) = 0 0.138 ( 0.003 ms): mmap(len: 106798, prot: READ, flags: PRIVATE, fd: 3) = 0x7f8fbeb7c000 0.140 ( 0.001 ms): close(fd: 3 ) = 0 0.151 ( 0.004 ms): open(filename: 0xbeb97640, flags: CLOEXEC) = 3 Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-p7m8llv81iv55ekxexdp5n57@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 99daeed55a9b8..d309f4535a45a 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2401,10 +2401,19 @@ static bool skip_sample(struct trace *trace, struct perf_sample *sample) } static void trace__set_base_time(struct trace *trace, - struct perf_evsel *evsel __maybe_unused, + struct perf_evsel *evsel, struct perf_sample *sample) { - if (trace->base_time == 0 && !trace->full_time) + /* + * BPF events were not setting PERF_SAMPLE_TIME, so be more robust + * and don't use sample->time unconditionally, we may end up having + * some other event in the future without PERF_SAMPLE_TIME for good + * reason, i.e. we may not be interested in its timestamps, just in + * it taking place, picking some piece of information when it + * appears in our event stream (vfs_getname comes to mind). + */ + if (trace->base_time == 0 && !trace->full_time && + (evsel->attr.sample_type & PERF_SAMPLE_TIME)) trace->base_time = sample->time; } -- GitLab From d37ba880598654fda10b312331377cdca3edd574 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 1 Apr 2016 13:26:42 +0000 Subject: [PATCH 124/705] perf bpf: Add sample types for 'bpf-output' event Before this patch we can see very large time in the events before the 'bpf-output' event. For example: # perf trace -vv -T --ev sched:sched_switch \ --ev bpf-output/no-inherit,name=evt/ \ --ev ./test_bpf_trace.c/map:channel.event=evt/ \ usleep 10 ... 
18446744073709.551 (18446564645918.480 ms): usleep/4157 nanosleep(rqtp: 0x7ffd3f0dc4e0) ... 18446744073709.551 ( ): evt:Raise a BPF event!..) 179427791.076 ( ): perf_bpf_probe:func_begin:(ffffffff810eb9a0)) 179427791.081 ( ): sched:sched_switch:usleep:4157 [120] S ==> swapper/2:0 [120]) ... We can also see the differences between bpf-output events and breakpoint events: For bpf output event: sample_type IP|TID|RAW|IDENTIFIER For tracepoint events: sample_type IP|TID|TIME|CPU|PERIOD|RAW|IDENTIFIER This patch fixes these differences by adding more sample types for bpf-output events. After this patch: # perf trace -vv -T --ev sched:sched_switch \ --ev bpf-output/no-inherit,name=evt/ \ --ev ./test_bpf_trace.c/map:channel.event=evt/ \ usleep 10 ... 179877370.878 ( 0.003 ms): usleep/5336 nanosleep(rqtp: 0x7ffff866c450) ... 179877370.878 ( ): evt:Raise a BPF event!..) 179877370.878 ( ): perf_bpf_probe:func_begin:(ffffffff810eb9a0)) 179877370.882 ( ): sched:sched_switch:usleep:5336 [120] S ==> swapper/4:0 [120]) 179877370.945 ( ): evt:Raise a BPF event!..) ... # ./perf trace -vv -T --ev sched:sched_switch \ --ev bpf-output/no-inherit,name=evt/ \ --ev ./test_bpf_trace.c/map:channel.event=evt/ \ usleep 10 2>&1 | grep sample_type sample_type IP|TID|TIME|ID|CPU|PERIOD|RAW sample_type IP|TID|TIME|ID|CPU|PERIOD|RAW sample_type IP|TID|TIME|ID|CPU|PERIOD|RAW sample_type IP|TID|TIME|ID|CPU|PERIOD|RAW sample_type IP|TID|TIME|ID|CPU|PERIOD|RAW sample_type IP|TID|TIME|ID|CPU|PERIOD|RAW The 'IDENTIFIER' info is not required because all events have the same sample_type. Committer notes: Further testing, on top of the changes making 'perf trace' avoid samples from events without PERF_SAMPLE_TIME: Before: # trace --ev bpf-output/no-inherit,name=evt/ --ev /home/acme/bpf/test_bpf_trace.c/map:channel.event=evt/ usleep 10 0.560 ( 0.001 ms): brk( ) = 0x55e5a1df8000 18446640227439.430 (18446640227438.859 ms): nanosleep(rqtp: 0x7ffc96643370) ... 18446640227439.430 ( ): evt:Raise a BPF event!..) 0.576 ( ): perf_bpf_probe:func_begin:(ffffffff81112460)) 18446640227439.430 ( ): evt:Raise a BPF event!..) 0.645 ( ): perf_bpf_probe:func_end:(ffffffff81112460 <- ffffffff81003d92)) 0.646 ( 0.076 ms): ... [continued]: nanosleep()) = 0 # After: # trace --ev bpf-output/no-inherit,name=evt/ --ev /home/acme/bpf/test_bpf_trace.c/map:channel.event=evt/ usleep 10 0.292 ( 0.001 ms): brk( ) = 0x55c7cd6e1000 0.302 ( 0.004 ms): nanosleep(rqtp: 0x7ffedd8bc0f0) ... 0.302 ( ): evt:Raise a BPF event!..) 0.303 ( ): perf_bpf_probe:func_begin:(ffffffff81112460)) 0.397 ( ): evt:Raise a BPF event!..) 0.397 ( ): perf_bpf_probe:func_end:(ffffffff81112460 <- ffffffff81003d92)) 0.398 ( 0.100 ms): ...
[continued]: nanosleep()) = 0 Signed-off-by: Wang Nan Reported-and-Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1459517202-42320-1-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 738ce226002b8..3fd7c2c72f4ad 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -226,7 +226,8 @@ struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx) perf_evsel__init(evsel, attr, idx); if (perf_evsel__is_bpf_output(evsel)) { - evsel->attr.sample_type |= PERF_SAMPLE_RAW; + evsel->attr.sample_type |= (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME | + PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD), evsel->attr.sample_period = 1; } -- GitLab From 03cc0789a690eb9ab07070376252961caeae7441 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 Apr 2016 14:56:58 -0400 Subject: [PATCH 125/705] do_splice_to(): cap the size before passing to ->splice_read() pipe capacity won't exceed 2G anyway. Signed-off-by: Al Viro --- fs/splice.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/splice.c b/fs/splice.c index 9947b5c696649..a6b87b7e07455 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1143,6 +1143,9 @@ static long do_splice_to(struct file *in, loff_t *ppos, if (unlikely(ret < 0)) return ret; + if (unlikely(len > MAX_RW_COUNT)) + len = MAX_RW_COUNT; + if (in->f_op->splice_read) splice_read = in->f_op->splice_read; else -- GitLab From d8e28654f28de74951ab1b7e59d2bebb442972aa Mon Sep 17 00:00:00 2001 From: Vinson Lee Date: Mon, 4 Apr 2016 22:07:39 +0000 Subject: [PATCH 126/705] perf config: Fix build with older toolchain. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix build error on Ubuntu 12.04.5 with GCC 4.6.3. 
CC util/config.o util/config.c: In function ‘perf_buildid_config’: util/config.c:384:15: error: declaration of ‘dirname’ shadows a global declaration [-Werror=shadow] Signed-off-by: Vinson Lee Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Taeung Song Cc: Wang Nan Fixes: 9cb5987c8227 ("perf config: Rework buildid_dir_command_config to perf_buildid_config") Link: http://lkml.kernel.org/r/1459807659-9020-1-git-send-email-vlee@freedesktop.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/config.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index 5c20d783423be..664490b8b327c 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -381,11 +381,11 @@ static int perf_buildid_config(const char *var, const char *value) { /* same dir for all commands */ if (!strcmp(var, "buildid.dir")) { - const char *dirname = perf_config_dirname(var, value); + const char *dir = perf_config_dirname(var, value); - if (!dirname) + if (!dir) return -1; - strncpy(buildid_dir, dirname, MAXPATHLEN-1); + strncpy(buildid_dir, dir, MAXPATHLEN-1); buildid_dir[MAXPATHLEN-1] = '\0'; } -- GitLab From bd0419e2a5a9fd9396cb7dc69044f961f52e19f0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 5 Apr 2016 11:33:41 -0300 Subject: [PATCH 127/705] perf probe: Check if dwarf_getlocations() is available If not, tell the user that: config/Makefile:273: Old libdw.h, finding variables at given 'perf probe' point will not work, install elfutils-devel/libdw-dev >= 0.157 And return -ENOTSUPP in die_get_var_range(), failing features that need it, like the one pointed out above. This fixes the build on older systems, such as Ubuntu 12.04.5. 
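[ Editor's note: for readers who have not met the API being probed for, a minimal sketch of how dwarf_getlocations() is typically iterated, assuming elfutils >= 0.157 and a DW_AT_location attribute already fetched via dwarf_attr(); the helper name is hypothetical, and the signature matches the feature test added below: ]

#include <elfutils/libdw.h>

/* Visit every entry in a variable's location list. */
static void visit_locations(Dwarf_Attribute *attr)
{
	Dwarf_Addr base, start, end;
	Dwarf_Op *ops;
	size_t nops;
	ptrdiff_t off = 0;

	/* Returns the offset to pass to the next call, 0 at the end, -1 on error. */
	while ((off = dwarf_getlocations(attr, off, &base,
					 &start, &end, &ops, &nops)) > 0) {
		/* [start, end) is the code range this entry covers,        */
		/* ops[0..nops) the DWARF expression computing the location. */
	}
}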
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Vinson Lee Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-9l7luqkq4gfnx7vrklkq4obs@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.feature | 2 ++ tools/build/feature/Makefile | 4 ++++ tools/build/feature/test-all.c | 5 +++++ tools/build/feature/test-dwarf_getlocations.c | 12 ++++++++++++ tools/perf/config/Makefile | 6 ++++++ tools/perf/util/dwarf-aux.c | 9 +++++++++ 6 files changed, 38 insertions(+) create mode 100644 tools/build/feature/test-dwarf_getlocations.c diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 6b7707270aa3b..9f878619077ae 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -30,6 +30,7 @@ endef FEATURE_TESTS_BASIC := \ backtrace \ dwarf \ + dwarf_getlocations \ fortify-source \ sync-compare-and-swap \ glibc \ @@ -78,6 +79,7 @@ endif FEATURE_DISPLAY ?= \ dwarf \ + dwarf_getlocations \ glibc \ gtk2 \ libaudit \ diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index c5f4c417428d7..4ae94dbfdab98 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -3,6 +3,7 @@ FILES= \ test-backtrace.bin \ test-bionic.bin \ test-dwarf.bin \ + test-dwarf_getlocations.bin \ test-fortify-source.bin \ test-sync-compare-and-swap.bin \ test-glibc.bin \ @@ -82,6 +83,9 @@ endif $(OUTPUT)test-dwarf.bin: $(BUILD) $(DWARFLIBS) +$(OUTPUT)test-dwarf_getlocations.bin: + $(BUILD) $(DWARFLIBS) + $(OUTPUT)test-libelf-mmap.bin: $(BUILD) -lelf diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c index e499a36c1e4a9..a282e8cb84f30 100644 --- a/tools/build/feature/test-all.c +++ b/tools/build/feature/test-all.c @@ -41,6 +41,10 @@ # include "test-dwarf.c" #undef main +#define main main_test_dwarf_getlocations +# include "test-dwarf_getlocations.c" +#undef main + #define main main_test_libelf_getphdrnum # include "test-libelf-getphdrnum.c" #undef main @@ -143,6 +147,7 @@ int main(int argc, char *argv[]) main_test_libelf_mmap(); main_test_glibc(); main_test_dwarf(); + main_test_dwarf_getlocations(); main_test_libelf_getphdrnum(); main_test_libunwind(); main_test_libaudit(); diff --git a/tools/build/feature/test-dwarf_getlocations.c b/tools/build/feature/test-dwarf_getlocations.c new file mode 100644 index 0000000000000..70162699dd434 --- /dev/null +++ b/tools/build/feature/test-dwarf_getlocations.c @@ -0,0 +1,12 @@ +#include +#include + +int main(void) +{ + Dwarf_Addr base, start, end; + Dwarf_Attribute attr; + Dwarf_Op *op; + size_t nops; + ptrdiff_t offset = 0; + return (int)dwarf_getlocations(&attr, offset, &base, &start, &end, &op, &nops); +} diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index f7d7f5a1cad53..6f8f6430f2bf6 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -268,6 +268,12 @@ else ifneq ($(feature-dwarf), 1) msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. 
Please install new elfutils-devel/libdw-dev); NO_DWARF := 1 + else + ifneq ($(feature-dwarf_getlocations), 1) + msg := $(warning Old libdw.h, finding variables at given 'perf probe' point will not work, install elfutils-devel/libdw-dev >= 0.157); + else + CFLAGS += -DHAVE_DWARF_GETLOCATIONS + endif # dwarf_getlocations endif # Dwarf support endif # libelf support endif # NO_LIBELF diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 577e600c8eb15..aea189b41cc8c 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -959,6 +959,7 @@ int die_get_varname(Dwarf_Die *vr_die, struct strbuf *buf) return 0; } +#ifdef HAVE_DWARF_GETLOCATIONS /** * die_get_var_innermost_scope - Get innermost scope range of given variable DIE * @sp_die: a subprogram DIE @@ -1080,3 +1081,11 @@ int die_get_var_range(Dwarf_Die *sp_die, Dwarf_Die *vr_die, struct strbuf *buf) return ret; } +#else +int die_get_var_range(Dwarf_Die *sp_die __maybe_unused, + Dwarf_Die *vr_die __maybe_unused, + struct strbuf *buf __maybe_unused) +{ + return -ENOTSUP; +} +#endif -- GitLab From 76e20522b709f3772e415d70b108028454a86ad5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 5 Apr 2016 12:21:44 -0300 Subject: [PATCH 128/705] perf script perl: Do error checking on new backtrace routine This ended up triggering these warnings when building on Ubuntu 12.04.5: util/scripting-engines/trace-event-perl.c: In function 'perl_process_callchain': util/scripting-engines/trace-event-perl.c:293:4: error: value computed is not used [-Werror=unused-value] util/scripting-engines/trace-event-perl.c:294:4: error: value computed is not used [-Werror=unused-value] util/scripting-engines/trace-event-perl.c:295:4: error: value computed is not used [-Werror=unused-value] util/scripting-engines/trace-event-perl.c:297:4: error: value computed is not used [-Werror=unused-value] util/scripting-engines/trace-event-perl.c:309:4: error: value computed is not used [-Werror=unused-value] cc1: all warnings being treated as errors mv: cannot stat `/tmp/build/perf/util/scripting-engines/.trace-event-perl.o.tmp': No such file or directory make[4]: *** [/tmp/build/perf/util/scripting-engines/trace-event-perl.o] Error 1 Fix it by doing error checking when building the perl data structures related to callchains. 
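[ Editor's note: the pattern used in the fix below, shown standalone: hv_stores() can fail and return NULL, in which case the partially built hash is dropped with hv_undef() before bailing out. A minimal sketch for embedded-perl/XS C code, with hypothetical names: ]

#include <EXTERN.h>
#include <perl.h>

/* Build { ip => ip_val } defensively; returns NULL if any store fails. */
static HV *build_ip_elem(UV ip_val)
{
	HV *elem = newHV();

	if (!elem)
		return NULL;

	/* hv_stores() returns NULL on failure, so check it like the fix does */
	if (!hv_stores(elem, "ip", newSVuv(ip_val))) {
		hv_undef(elem);	/* discard the partially built hash */
		return NULL;
	}

	return elem;
}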
Cc: Adrian Hunter Cc: David Ahern Cc: Dima Kogan Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Fixes: f7380c12ec6c ("perf script perl: Perl scripts now get a backtrace, like the python ones") Signed-off-by: Arnaldo Carvalho de Melo --- .../util/scripting-engines/trace-event-perl.c | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index 1d160855cda92..35ed00a600fbe 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -283,18 +283,27 @@ static SV *perl_process_callchain(struct perf_sample *sample, if (!elem) goto exit; - hv_stores(elem, "ip", newSVuv(node->ip)); + if (!hv_stores(elem, "ip", newSVuv(node->ip))) { + hv_undef(elem); + goto exit; + } if (node->sym) { HV *sym = newHV(); - if (!sym) + if (!sym) { + hv_undef(elem); + goto exit; + } + if (!hv_stores(sym, "start", newSVuv(node->sym->start)) || + !hv_stores(sym, "end", newSVuv(node->sym->end)) || + !hv_stores(sym, "binding", newSVuv(node->sym->binding)) || + !hv_stores(sym, "name", newSVpvn(node->sym->name, + node->sym->namelen)) || + !hv_stores(elem, "sym", newRV_noinc((SV*)sym))) { + hv_undef(sym); + hv_undef(elem); goto exit; - hv_stores(sym, "start", newSVuv(node->sym->start)); - hv_stores(sym, "end", newSVuv(node->sym->end)); - hv_stores(sym, "binding", newSVuv(node->sym->binding)); - hv_stores(sym, "name", newSVpvn(node->sym->name, - node->sym->namelen)); - hv_stores(elem, "sym", newRV_noinc((SV*)sym)); + } } if (node->map) { @@ -306,7 +315,10 @@ static SV *perl_process_callchain(struct perf_sample *sample, else if (map->dso->name) dsoname = map->dso->name; } - hv_stores(elem, "dso", newSVpv(dsoname,0)); + if (!hv_stores(elem, "dso", newSVpv(dsoname,0))) { + hv_undef(elem); + goto exit; + } } callchain_cursor_advance(&callchain_cursor); -- GitLab From 860b69f1d533de39fa70784768008d0eaf242e5c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 5 Apr 2016 17:01:52 +0200 Subject: [PATCH 129/705] perf tools: Remove superfluous ARCH Makefile includes Link: http://lkml.kernel.org/n/tip-yk6brsq3opuotr9by18xlkr8@git.kernel.org Signed-off-by: Jiri Olsa --- tools/perf/Makefile.perf | 2 -- tools/perf/config/Makefile | 3 --- 2 files changed, 5 deletions(-) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 000ea210389d3..58aed81a21ea9 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -297,8 +297,6 @@ endif # because maintaining the nesting to match is a pain. If # we had "elif" things would have been much nicer... 
--include arch/$(ARCH)/Makefile - ifneq ($(OUTPUT),) CFLAGS += -I$(OUTPUT) endif diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index 6f8f6430f2bf6..d1e2b856ef0fd 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -295,9 +295,6 @@ ifndef NO_LIBELF CFLAGS += -DHAVE_ELF_GETPHDRNUM_SUPPORT endif - # include ARCH specific config -include $(src-perf)/arch/$(ARCH)/Makefile - ifndef NO_DWARF ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined) msg := $(warning DWARF register mappings have not been defined for architecture $(ARCH), DWARF support disabled); -- GitLab From 85f8f966a152f5110a12b76511743fbfb62130ba Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 4 Apr 2016 15:58:06 -0700 Subject: [PATCH 130/705] perf list: Document event specifications better Document some features for specifying events in the perf list manpage: - Event groups - Leader sampling - How to specify raw PMU events in the new syntax - Global versus per-process PMUs. - Access restrictions - Fix Intel SDM URL v2: Lots of new content. Address review feedback. Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/1459810686-15913-1-git-send-email-andi@firstfloor.org [ Add quotes to some keywords, such as "any" ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-list.txt | 107 ++++++++++++++++++++++++- 1 file changed, 106 insertions(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index ec723d0a5bb3f..a126e97a81143 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -93,6 +93,67 @@ raw encoding of 0x1A8 can be used: You should refer to the processor specific documentation for getting these details. Some of them are referenced in the SEE ALSO section below. +ARBITRARY PMUS +-------------- + +perf also supports an extended syntax for specifying raw parameters +to PMUs. Using this typically requires looking up the specific event +in the CPU vendor specific documentation. + +The available PMUs and their raw parameters can be listed with + + ls /sys/devices/*/format + +For example, the "LSD.UOPS" core PMU event above could +be specified as a raw event: + + perf stat -e cpu/event=0xa8,umask=0x1,name=LSD.UOPS_CYCLES,cmask=1/ ... + +PER SOCKET PMUS +--------------- + +Some PMUs are not associated with a core, but with a whole CPU socket. +Events on these PMUs generally cannot be sampled, but only counted globally +with perf stat -a. They can be bound to one logical CPU, but will measure +all the CPUs in the same socket. + +This example measures memory bandwidth every second +on the first memory controller on socket 0 of an Intel Xeon system + + perf stat -C 0 -a uncore_imc_0/cas_count_read/,uncore_imc_0/cas_count_write/ -I 1000 ... + +Each memory controller has its own PMU. Measuring the complete system +bandwidth would require specifying all imc PMUs (see perf list output), +and adding the values together. + +This example measures the combined core power every second + + perf stat -I 1000 -e power/energy-cores/ -a + +ACCESS RESTRICTIONS +------------------- + +For non-root users, generally only context-switched PMU events are available. +This is normally only the events in the cpu PMU, the predefined events +like cycles and instructions and some software events. + +Other PMUs and global measurements are normally root only. +Some event qualifiers, such as "any", are also root only.
+ +This can be overridden by setting the kernel.perf_event_paranoid +sysctl to -1, which allows non-root users to use these events. + +For accessing tracepoint events, perf needs to have read access to +/sys/kernel/debug/tracing, even when perf_event_paranoid is in a relaxed +setting. + +TRACING +------- + +Some PMUs control advanced hardware tracing capabilities, such as Intel PT, +that allow low-overhead execution tracing. These are described in a separate +intel-pt.txt document. + PARAMETERIZED EVENTS -------------------- @@ -106,6 +167,50 @@ also be supplied. For example: perf stat -C 0 -e 'hv_gpci/dtbp_ptitc,phys_processor_idx=0x2/' ... +EVENT GROUPS +------------ + +Perf supports time-based multiplexing of events, when the number of events +active exceeds the number of hardware performance counters. Multiplexing +can cause measurement errors when the workload changes its execution +profile. + +When metrics are computed using formulas from event counts, it is useful to +ensure some events are always measured together as a group to minimize multiplexing +errors. Event groups can be specified using { }. + + perf stat -e '{instructions,cycles}' ... + +The number of available performance counters depends on the CPU. A group +cannot contain more events than available counters. +For example, Intel Core CPUs typically have four generic performance counters +for the core, plus three fixed counters for instructions, cycles and +ref-cycles. Some special events have restrictions on which counter they +can schedule, and may not support multiple instances in a single group. +When too many events are specified in the group, none of them will +be measured. + +Globally pinned events can limit the number of counters available for +other groups. On x86 systems, the NMI watchdog pins a counter by default. +The NMI watchdog can be disabled as root with + + echo 0 > /proc/sys/kernel/nmi_watchdog + +Events from multiple different PMUs cannot be mixed in a group, with +some exceptions for software events. + +LEADER SAMPLING +--------------- + +perf also supports group leader sampling using the :S specifier. + + perf record -e '{cycles,instructions}:S' ... + perf report --group + +Normally all events in an event group sample, but with :S only +the first event (the leader) samples, and it only reads the values of the +other events in the group.
+ OPTIONS ------- @@ -143,5 +248,5 @@ SEE ALSO -------- linkperf:perf-stat[1], linkperf:perf-top[1], linkperf:perf-record[1], -http://www.intel.com/Assets/PDF/manual/253669.pdf[Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide], +http://www.intel.com/sdm/[Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide], http://support.amd.com/us/Processor_TechDocs/24593_APM_v2.pdf[AMD64 Architecture Programmer’s Manual Volume 2: System Programming] -- GitLab From a3bca91f2fe54af502deaf277dd5ac0e18bffde4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 6 Apr 2016 12:51:33 -0300 Subject: [PATCH 131/705] perf trace: Beautify sched_setscheduler 'policy' argument $ trace -e sched_setscheduler chrt -f 1 usleep 1 chrt: failed to set pid 0's policy: Operation not permitted 0.005 ( 0.005 ms): chrt/19189 sched_setscheduler(policy: FIFO, param: 0x7ffec5273d70) = -1 EPERM Operation not permitted $ Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-i5vlo5n5jv0amt8bkyicmdxh@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 4 +++ tools/perf/trace/beauty/sched_policy.c | 44 ++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 tools/perf/trace/beauty/sched_policy.c diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index d309f4535a45a..c283153d8c7f0 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1073,6 +1073,8 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \ .arg_parm = { [arg] = &strarray__##array, } +#include "trace/beauty/sched_policy.c" + static struct syscall_fmt { const char *name; const char *alias; @@ -1304,6 +1306,8 @@ static struct syscall_fmt { .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, }, { .name = "rt_tgsigqueueinfo", .errmsg = true, .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, }, + { .name = "sched_setscheduler", .errmsg = true, + .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, }, { .name = "seccomp", .errmsg = true, .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */ [1] = SCA_SECCOMP_FLAGS, /* flags */ }, }, diff --git a/tools/perf/trace/beauty/sched_policy.c b/tools/perf/trace/beauty/sched_policy.c new file mode 100644 index 0000000000000..c205bc608b3cf --- /dev/null +++ b/tools/perf/trace/beauty/sched_policy.c @@ -0,0 +1,44 @@ +#include + +/* + * Not defined anywhere else, probably, just to make sure we + * catch future flags + */ +#define SCHED_POLICY_MASK 0xff + +#ifndef SCHED_DEADLINE +#define SCHED_DEADLINE 6 +#endif + +static size_t syscall_arg__scnprintf_sched_policy(char *bf, size_t size, + struct syscall_arg *arg) +{ + const char *policies[] = { + "NORMAL", "FIFO", "RR", "BATCH", "ISO", "IDLE", "DEADLINE", + }; + size_t printed; + int policy = arg->val, + flags = policy & ~SCHED_POLICY_MASK; + + policy &= SCHED_POLICY_MASK; + if (policy <= SCHED_DEADLINE) + printed = scnprintf(bf, size, "%s", policies[policy]); + else + printed = scnprintf(bf, size, "%#x", policy); + +#define P_POLICY_FLAG(n) \ + if (flags & SCHED_##n) { \ + printed += scnprintf(bf + printed, size - printed, "|%s", #n); \ + flags &= ~SCHED_##n; \ + } + + P_POLICY_FLAG(RESET_ON_FORK); +#undef P_POLICY_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "|%#x", flags); + + return printed; +} + +#define SCA_SCHED_POLICY 
syscall_arg__scnprintf_sched_policy -- GitLab From 7206b900e6e4b7f4c2e766eab64ea1ca5303e421 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 6 Apr 2016 14:11:36 -0300 Subject: [PATCH 132/705] perf trace: Beautify wait4/waitid 'options' argument # trace -e waitid,wait4 0.557 ( 0.557 ms): bash/27335 wait4(upid: -1, stat_addr: 0x7ffd02f449f0) = 27336 1.250 ( 0.685 ms): bash/27335 wait4(upid: -1, stat_addr: 0x7ffd02f449f0) = 27337 1.312 ( 0.002 ms): bash/27335 wait4(upid: -1, stat_addr: 0x7ffd02f44690, options: NOHANG) = -1 ECHILD No child processes 1.550 ( 0.015 ms): bash/3856 wait4(upid: -1, stat_addr: 0x7ffd02f44990, options: NOHANG|UNTRACED|CONTINUED) = 27335 1.552 ( 0.001 ms): bash/3856 wait4(upid: -1, stat_addr: 0x7ffd02f44990, options: NOHANG|UNTRACED|CONTINUED) = 0 # Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-i5vlo5n5jv0amt8bkyicmdxh@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 5 +++++ tools/perf/trace/beauty/waitid_options.c | 26 ++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 tools/perf/trace/beauty/waitid_options.c diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index c283153d8c7f0..9a6c7b1fd5a15 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1074,6 +1074,7 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, .arg_parm = { [arg] = &strarray__##array, } #include "trace/beauty/sched_policy.c" +#include "trace/beauty/waitid_options.c" static struct syscall_fmt { const char *name; @@ -1364,6 +1365,10 @@ static struct syscall_fmt { .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, }, { .name = "vmsplice", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, + { .name = "wait4", .errmsg = true, + .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, }, + { .name = "waitid", .errmsg = true, + .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, }, { .name = "write", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "writev", .errmsg = true, diff --git a/tools/perf/trace/beauty/waitid_options.c b/tools/perf/trace/beauty/waitid_options.c new file mode 100644 index 0000000000000..7942724adec8b --- /dev/null +++ b/tools/perf/trace/beauty/waitid_options.c @@ -0,0 +1,26 @@ +#include +#include + +static size_t syscall_arg__scnprintf_waitid_options(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, options = arg->val; + +#define P_OPTION(n) \ + if (options & W##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + options &= ~W##n; \ + } + + P_OPTION(NOHANG); + P_OPTION(UNTRACED); + P_OPTION(CONTINUED); +#undef P_OPTION + + if (options) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? 
"|" : "", options); + + return printed; +} + +#define SCA_WAITID_OPTIONS syscall_arg__scnprintf_waitid_options -- GitLab From 11c8e39f5133aed9e0f8ffc624c7d5f64c97bc79 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 6 Apr 2016 14:33:07 -0300 Subject: [PATCH 133/705] perf trace: Infrastructure to show COMM strings for syscalls returning PIDs Starting with clone, waitid and wait4: # trace -e waitid,wait4 1.385 ( 1.385 ms): bash/12122 wait4(upid: -1, stat_addr: 0x7ffe0cee1720, options: UNTRACED|CONTINUED) = 1210 (ls) 1.426 ( 0.002 ms): bash/12122 wait4(upid: -1, stat_addr: 0x7ffe0cee1150, options: NOHANG|UNTRACED|CONTINUED) = 0 3.293 ( 0.604 ms): bash/1211 wait4(upid: -1, stat_addr: 0x7ffe0cee0560 ) = 1214 (sed) 3.342 ( 0.002 ms): bash/1211 wait4(upid: -1, stat_addr: 0x7ffe0cee01d0, options: NOHANG ) = -1 ECHILD No child processes 3.576 ( 0.016 ms): bash/12122 wait4(upid: -1, stat_addr: 0x7ffe0cee0550, options: NOHANG|UNTRACED|CONTINUED) = 1211 (bash) ^C# trace -e clone 0.027 ( 0.000 ms): systemd/1 ... [continued]: clone()) = 1227 (systemd) 0.050 ( 0.000 ms): systemd/1227 ... [continued]: clone()) = 0 ^C[root@jouet ~]# Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-lyf5d3y5j15wikjb6pe6ukoi@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 9a6c7b1fd5a15..22a4901d057ba 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1082,6 +1082,7 @@ static struct syscall_fmt { size_t (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg); void *arg_parm[6]; bool errmsg; + bool errpid; bool timeout; bool hexret; } syscall_fmts[] = { @@ -1099,6 +1100,7 @@ static struct syscall_fmt { { .name = "chroot", .errmsg = true, .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, }, { .name = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), }, + { .name = "clone", .errpid = true, }, { .name = "close", .errmsg = true, .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, }, { .name = "connect", .errmsg = true, }, @@ -1365,9 +1367,9 @@ static struct syscall_fmt { .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, }, { .name = "vmsplice", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, - { .name = "wait4", .errmsg = true, + { .name = "wait4", .errpid = true, .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, }, - { .name = "waitid", .errmsg = true, + { .name = "waitid", .errpid = true, .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, }, { .name = "write", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, @@ -2156,7 +2158,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, if (sc->fmt == NULL) { signed_print: fprintf(trace->output, ") = %ld", ret); - } else if (ret < 0 && sc->fmt->errmsg) { + } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) { char bf[STRERR_BUFSIZE]; const char *emsg = strerror_r(-ret, bf, sizeof(bf)), *e = audit_errno_to_name(-ret); @@ -2166,7 +2168,16 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, fprintf(trace->output, ") = 0 Timeout"); else if (sc->fmt->hexret) fprintf(trace->output, ") = %#lx", ret); - else + else if (sc->fmt->errpid) { + struct thread *child = machine__find_thread(trace->host, ret, ret); + + if (child != NULL) { + fprintf(trace->output, ") = 
%ld", ret); + if (child->comm_set) + fprintf(trace->output, " (%s)", thread__comm_str(child)); + thread__put(child); + } + } else goto signed_print; fputc('\n', trace->output); -- GitLab From c65f10701ac68259043ccbfac1979778a1fd7846 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 6 Apr 2016 14:55:18 -0300 Subject: [PATCH 134/705] perf trace: Beautify set_tid_address, getpid, getppid return values Showing the COMM for that return, if available. # trace -e getpid,getppid,set_tid_address 490.007 ( 0.005 ms): sh/8250 getpid(...) = 8250 (sh) 490.014 ( 0.001 ms): sh/8250 getppid(...) = 7886 (make) 491.156 ( 0.004 ms): install/8251 set_tid_address(tidptr: 0x7f204a9d4ad0) = 8251 (install) ^C Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-psbpplqupatom9x4uohbxid5@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 22a4901d057ba..191f4d61eb161 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1166,6 +1166,8 @@ static struct syscall_fmt { { .name = "getdents64", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), }, + { .name = "getpid", .errpid = true, }, + { .name = "getppid", .errpid = true, }, { .name = "getrandom", .errmsg = true, .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, }, { .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), }, @@ -1324,6 +1326,7 @@ static struct syscall_fmt { { .name = "sendto", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ [3] = SCA_MSG_FLAGS, /* flags */ }, }, + { .name = "set_tid_address", .errpid = true, }, { .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), }, { .name = "setrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), }, { .name = "setxattr", .errmsg = true, -- GitLab From d1d438a3b1eb64eb99fc918d13a52ded3e941d67 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 6 Apr 2016 18:02:41 -0300 Subject: [PATCH 135/705] perf trace: Beautify pid_t arguments When reading the syscall tracepoint /format file, look for arguments of type "pid_t" and attach the PID beautifier, that will do a lookup on the threads it knows, i.e. the ones that came from PERF_RECORD_COMM events and add the COMM after the pid in such args: Excerpt of a system wide trace for syscalls with pid_t args: 55602.977 ( 0.006 ms): bash/12122 setpgid(pid: 24347 (bash), pgid: 24347 (bash)) = 0 55603.024 ( 0.004 ms): bash/24347 setpgid(pid: 24347 (bash), pgid: 24347 (bash)) = 0 55691.527 (88.397 ms): bash/12122 wait4(upid: -1, stat_addr: 0x7ffe0cee1720, options: UNTRACED|CONTINUED) ... 55692.479 ( 0.952 ms): git/24347 wait4(upid: 24368, stat_addr: 0x7ffe030d5724) ... 55694.549 ( 2.070 ms): pre-commit/24368 wait4(upid: -1, stat_addr: 0x7ffc94f4fc10) = 24369 (pre-commit) 55694.575 ( 0.002 ms): pre-commit/24368 wait4(upid: -1, stat_addr: 0x7ffc94f4f650, options: NOHANG) = -1 ECHILD No child processes 55695.934 ( 0.010 ms): pre-commit/24368 wait4(upid: -1, stat_addr: 0x7ffc94f4f2d0, options: NOHANG) = 24370 (git) 55695.937 ( 0.001 ms): pre-commit/24368 wait4(upid: -1, stat_addr: 0x7ffc94f4f2d0, options: NOHANG) = -1 ECHILD No child processes 55717.963 ( 0.000 ms): pre-commit/24371 ... 
[continued]: wait4()) = 24372 55717.978 (21.468 ms): :24371/24371 wait4(upid: -1, stat_addr: 0x7ffc94f4f230) ... 55718.087 ( 0.109 ms): pre-commit/24371 wait4(upid: -1, stat_addr: 0x7ffc94f4f230) = 24373 (tr) 55718.187 ( 0.096 ms): pre-commit/24371 wait4(upid: -1, stat_addr: 0x7ffc94f4f230) = 24374 (wc) 55718.218 ( 0.002 ms): pre-commit/24371 wait4(upid: -1, stat_addr: 0x7ffc94f4eed0, options: NOHANG) = -1 ECHILD No child processes 55718.367 ( 0.005 ms): pre-commit/24368 wait4(upid: -1, stat_addr: 0x7ffc94f4f1d0, options: NOHANG) = 24371 (pre-commit) 55718.369 ( 0.001 ms): pre-commit/24368 wait4(upid: -1, stat_addr: 0x7ffc94f4f1d0, options: NOHANG) = -1 ECHILD No child processes 55741.021 (49.494 ms): git/24347 ... [continued]: wait4()) = 24368 (pre-commit) 74146.427 (18319.601 ms): git/24347 wait4(upid: 24375 (git), stat_addr: 0x7ffe030d6824) ... 74149.036 ( 0.891 ms): bash/24391 wait4(upid: -1, stat_addr: 0x7ffe0cee0560) = 24393 (sed) Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-75yl9hzjhb020iadc81gdj8t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 110 ++++++++++++++++++---------------- tools/perf/trace/beauty/pid.c | 18 ++++++ 2 files changed, 75 insertions(+), 53 deletions(-) create mode 100644 tools/perf/trace/beauty/pid.c diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 191f4d61eb161..57d4bb473add3 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -112,6 +112,58 @@ # define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ #endif +struct trace { + struct perf_tool tool; + struct { + int machine; + int open_id; + } audit; + struct { + int max; + struct syscall *table; + struct { + struct perf_evsel *sys_enter, + *sys_exit; + } events; + } syscalls; + struct record_opts opts; + struct perf_evlist *evlist; + struct machine *host; + struct thread *current; + u64 base_time; + FILE *output; + unsigned long nr_events; + struct strlist *ev_qualifier; + struct { + size_t nr; + int *entries; + } ev_qualifier_ids; + struct intlist *tid_list; + struct intlist *pid_list; + struct { + size_t nr; + pid_t *entries; + } filter_pids; + double duration_filter; + double runtime_ms; + struct { + u64 vfs_getname, + proc_getname; + } stats; + bool not_ev_qualifier; + bool live; + bool full_time; + bool sched; + bool multiple_threads; + bool summary; + bool summary_only; + bool show_comm; + bool show_tool_stats; + bool trace_syscalls; + bool force; + bool vfs_getname; + int trace_pgfaults; +}; struct tp_field { int offset; @@ -1073,6 +1125,7 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \ .arg_parm = { [arg] = &strarray__##array, } +#include "trace/beauty/pid.c" #include "trace/beauty/sched_policy.c" #include "trace/beauty/waitid_options.c" @@ -1167,6 +1220,7 @@ static struct syscall_fmt { .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), }, { .name = "getpid", .errpid = true, }, + { .name = "getpgid", .errpid = true, }, { .name = "getppid", .errpid = true, }, { .name = "getrandom", .errmsg = true, .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, }, @@ -1328,6 +1382,7 @@ static struct syscall_fmt { [3] = SCA_MSG_FLAGS, /* flags */ }, }, { .name = "set_tid_address", .errpid = true, }, { .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), }, + { .name = "setpgid", .errmsg = true, }, { 
.name = "setrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), }, { .name = "setxattr", .errmsg = true, .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, @@ -1485,59 +1540,6 @@ static struct thread_trace *thread__trace(struct thread *thread, FILE *fp) static const size_t trace__entry_str_size = 2048; -struct trace { - struct perf_tool tool; - struct { - int machine; - int open_id; - } audit; - struct { - int max; - struct syscall *table; - struct { - struct perf_evsel *sys_enter, - *sys_exit; - } events; - } syscalls; - struct record_opts opts; - struct perf_evlist *evlist; - struct machine *host; - struct thread *current; - u64 base_time; - FILE *output; - unsigned long nr_events; - struct strlist *ev_qualifier; - struct { - size_t nr; - int *entries; - } ev_qualifier_ids; - struct intlist *tid_list; - struct intlist *pid_list; - struct { - size_t nr; - pid_t *entries; - } filter_pids; - double duration_filter; - double runtime_ms; - struct { - u64 vfs_getname, - proc_getname; - } stats; - bool not_ev_qualifier; - bool live; - bool full_time; - bool sched; - bool multiple_threads; - bool summary; - bool summary_only; - bool show_comm; - bool show_tool_stats; - bool trace_syscalls; - bool force; - bool vfs_getname; - int trace_pgfaults; -}; - static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname) { struct thread_trace *ttrace = thread__priv(thread); @@ -1763,6 +1765,8 @@ static int syscall__set_arg_fmts(struct syscall *sc) sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx]; else if (field->flags & FIELD_IS_POINTER) sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex; + else if (strcmp(field->type, "pid_t") == 0) + sc->arg_scnprintf[idx] = SCA_PID; ++idx; } diff --git a/tools/perf/trace/beauty/pid.c b/tools/perf/trace/beauty/pid.c new file mode 100644 index 0000000000000..111ae08d38f10 --- /dev/null +++ b/tools/perf/trace/beauty/pid.c @@ -0,0 +1,18 @@ +static size_t syscall_arg__scnprintf_pid(char *bf, size_t size, struct syscall_arg *arg) +{ + int pid = arg->val; + struct trace *trace = arg->trace; + size_t printed = scnprintf(bf, size, "%d", pid); + struct thread *thread = machine__find_thread(trace->host, pid, pid); + + if (thread != NULL) { + if (thread->comm_set) + printed += scnprintf(bf + printed, size - printed, + " (%s)", thread__comm_str(thread)); + thread__put(thread); + } + + return printed; +} + +#define SCA_PID syscall_arg__scnprintf_pid -- GitLab From 7d6a7e782558323364bc0ae59f3523175c10b258 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Apr 2016 09:11:11 +0200 Subject: [PATCH 136/705] perf tools: Introduce trim function To be used in cases where both sides need trimming.
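A minimal usage sketch (hypothetical buffer contents, not part of this patch; ltrim() and rtrim() are the existing helpers the new function composes):

	char buf[] = "  Overhead  Shared Object  ";
	char *s = trim(buf);	/* rtrim() truncates the tail in place, ltrim() skips leading blanks */

	printf("%s\n", s);	/* "Overhead  Shared Object"; s may point past buf[0] */

Callers therefore keep the original pointer around when they need the start of the allocation, as the hists header code below does with its start != dummy_hpp.buf check.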
Signed-off-by: Jiri Olsa Cc: Andreas Hollmann Cc: David Ahern Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1460013073-18444-1-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/hists.c | 3 +-- tools/perf/ui/stdio/hist.c | 3 +-- tools/perf/util/util.h | 5 +++++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 2a83414159a65..e70df2e54d667 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1607,9 +1607,8 @@ static int hists_browser__scnprintf_hierarchy_headers(struct hist_browser *brows ret = fmt->header(fmt, &dummy_hpp, hists_to_evsel(hists)); dummy_hpp.buf[ret] = '\0'; - rtrim(dummy_hpp.buf); - start = ltrim(dummy_hpp.buf); + start = trim(dummy_hpp.buf); ret = strlen(start); if (start != dummy_hpp.buf) diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 7aff5acf32657..560eb47d56f94 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -569,9 +569,8 @@ static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp, first_col = false; fmt->header(fmt, hpp, hists_to_evsel(hists)); - rtrim(hpp->buf); - header_width += fprintf(fp, "%s", ltrim(hpp->buf)); + header_width += fprintf(fp, "%s", trim(hpp->buf)); } } diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 8298d607c7383..3bf3de86d4297 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -254,6 +254,11 @@ int hex2u64(const char *ptr, u64 *val); char *ltrim(char *s); char *rtrim(char *s); +static inline char *trim(char *s) +{ + return ltrim(rtrim(s)); +} + void dump_stack(void); void sighandler_dump_stack(int sig); -- GitLab From e583d70c54976f81855c7ca763b036bad399f4e0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Apr 2016 09:11:12 +0200 Subject: [PATCH 137/705] perf tools: Add dedicated unwind addr_space member into thread struct Milian reported an issue with thread::priv, which was double-booked by the perf trace and DWARF unwind code, so using the two together is impossible at the moment. Move the DWARF unwind private data into a separate variable so that perf trace can keep using thread::priv.
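To illustrate the clash being removed (a sketch, not code from this patch): in DWARF callchain mode, both users previously went through the same thread::priv slot, so whichever assignment ran last clobbered the other:

	/* before: two owners of the same pointer */
	thread__set_priv(thread, addr_space);	/* unwind__prepare_access() */
	thread__set_priv(thread, ttrace);	/* perf trace's per-thread state */

	/* after: the unwind state gets its own member */
	thread->addr_space = unw_create_addr_space(&accessors, 0);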
Reported-and-Tested-by: Milian Wolff Signed-off-by: Jiri Olsa Cc: Andreas Hollmann Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1460013073-18444-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/thread.h | 6 ++++++ tools/perf/util/unwind-libunwind.c | 25 +++++++++---------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index a0ac0317affb5..e214207bb13ac 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -9,6 +9,9 @@ #include "symbol.h" #include #include +#ifdef HAVE_LIBUNWIND_SUPPORT +#include +#endif struct thread_stack; @@ -32,6 +35,9 @@ struct thread { void *priv; struct thread_stack *ts; +#ifdef HAVE_LIBUNWIND_SUPPORT + unw_addr_space_t addr_space; +#endif }; struct machine; diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c index ee7e372297e59..63687d3a344e7 100644 --- a/tools/perf/util/unwind-libunwind.c +++ b/tools/perf/util/unwind-libunwind.c @@ -32,6 +32,7 @@ #include "symbol.h" #include "util.h" #include "debug.h" +#include "asm/bug.h" extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as, @@ -580,43 +581,33 @@ static unw_accessors_t accessors = { int unwind__prepare_access(struct thread *thread) { - unw_addr_space_t addr_space; - if (callchain_param.record_mode != CALLCHAIN_DWARF) return 0; - addr_space = unw_create_addr_space(&accessors, 0); - if (!addr_space) { + thread->addr_space = unw_create_addr_space(&accessors, 0); + if (!thread->addr_space) { pr_err("unwind: Can't create unwind address space.\n"); return -ENOMEM; } - unw_set_caching_policy(addr_space, UNW_CACHE_GLOBAL); - thread__set_priv(thread, addr_space); - + unw_set_caching_policy(thread->addr_space, UNW_CACHE_GLOBAL); return 0; } void unwind__flush_access(struct thread *thread) { - unw_addr_space_t addr_space; - if (callchain_param.record_mode != CALLCHAIN_DWARF) return; - addr_space = thread__priv(thread); - unw_flush_cache(addr_space, 0, 0); + unw_flush_cache(thread->addr_space, 0, 0); } void unwind__finish_access(struct thread *thread) { - unw_addr_space_t addr_space; - if (callchain_param.record_mode != CALLCHAIN_DWARF) return; - addr_space = thread__priv(thread); - unw_destroy_addr_space(addr_space); + unw_destroy_addr_space(thread->addr_space); } static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb, @@ -639,7 +630,9 @@ static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb, * unwind itself. */ if (max_stack - 1 > 0) { - addr_space = thread__priv(ui->thread); + WARN_ONCE(!ui->thread, "WARNING: ui->thread is NULL"); + addr_space = ui->thread->addr_space; + if (addr_space == NULL) return -1; -- GitLab From 91daee306a51ca7b4d3ca7fdcf7472b0ed2c80c1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Apr 2016 09:11:13 +0200 Subject: [PATCH 138/705] perf script: Process event update events Andreas reported that the following command produces no output: # cat test.py #!/usr/bin/env python def stat__krava(cpu, thread, time, val, ena, run): print "event %s cpu %d, thread %d, time %d, val %d, ena %d, run %d" % \ ("krava", cpu, thread, time, val, ena, run) # perf stat -a -I 1000 -e cycles,"cpu/config=0x6530160,name=krava/" record | perf script -s test.py ^C # The reason is that 'perf script' does not process event update events and will never get the event name update, thus the python callback is never called.
The fix is just to add already existing callback we use in 'perf stat report'. Committer note: After the patch: # perf stat -a -I 1000 -e cycles,"cpu/config=0x6530160,name=krava/" record | perf script -s test.py event krava cpu -1, thread -1, time 1000239179, val 1789051, ena 4000690920, run 4000690920 event krava cpu -1, thread -1, time 2000479061, val 2391338, ena 4000879596, run 4000879596 event krava cpu -1, thread -1, time 3000740802, val 1939121, ena 4000977209, run 4000977209 event krava cpu -1, thread -1, time 4001006730, val 2356115, ena 4001000489, run 4001000489 ^C # Reported-by: Andreas Hollmann Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1460013073-18444-3-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 3770c3dffe5e1..59009aa7e2ca4 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1961,6 +1961,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) .exit = perf_event__process_exit, .fork = perf_event__process_fork, .attr = process_attr, + .event_update = perf_event__process_event_update, .tracing_data = perf_event__process_tracing_data, .build_id = perf_event__process_build_id, .id_index = perf_event__process_id_index, -- GitLab From ba2f22cf9989561c08225f0e88078d5562832313 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 7 Apr 2016 12:05:51 -0300 Subject: [PATCH 139/705] perf trace: Beautify mode_t arguments When reading the syscall tracepoint /format file, look for arguments of type "mode_t" and attach a beautifier: [root@jouet ~]# cat ~/bin/tp_with_fields_of_type #!/bin/bash grep -w $1 /sys/kernel/tracing/events/syscalls/*/format | sed -r 's%.*sys_enter_(.*)/format.*%\1%g' | paste -d, -s # tp_with_fields_of_type umode_t chmod,creat,fchmodat,fchmod,mkdirat,mkdir,mknodat,mknod,mq_open,openat,open # Testing it: #define S_ISUID 0004000 #define S_ISGID 0002000 #define S_ISVTX 0001000 #define S_IRWXU 0000700 #define S_IRUSR 0000400 #define S_IWUSR 0000200 #define S_IXUSR 0000100 #define S_IRWXG 0000070 #define S_IRGRP 0000040 #define S_IWGRP 0000020 #define S_IXGRP 0000010 #define S_IRWXO 0000007 #define S_IROTH 0000004 #define S_IWOTH 0000002 #define S_IXOTH 0000001 # for mode in 4000 2000 1000 700 400 200 100 70 40 20 10 7 4 2 1 ; do \ echo -n $mode '->' ; trace --no-inherit -e chmod,fchmodat,fchmod chmod $mode x; \ done 4000 -> 0.338 ( 0.012 ms): fchmodat(dfd: CWD, filename: x, mode: ISUID) = 0 2000 -> 0.438 ( 0.015 ms): fchmodat(dfd: CWD, filename: x, mode: ISGID) = 0 1000 -> 0.677 ( 0.040 ms): fchmodat(dfd: CWD, filename: x, mode: ISVTX) = 0 700 -> 0.394 ( 0.013 ms): fchmodat(dfd: CWD, filename: x, mode: IRWXU) = 0 400 -> 0.337 ( 0.010 ms): fchmodat(dfd: CWD, filename: x, mode: IRUSR) = 0 200 -> 0.259 ( 0.008 ms): fchmodat(dfd: CWD, filename: x, mode: IWUSR) = 0 100 -> 0.249 ( 0.008 ms): fchmodat(dfd: CWD, filename: x, mode: IXUSR) = 0 70 -> 0.266 ( 0.008 ms): fchmodat(dfd: CWD, filename: x, mode: IRWXG) = 0 40 -> 0.329 ( 0.009 ms): fchmodat(dfd: CWD, filename: x, mode: IRGRP) = 0 20 -> 0.250 ( 0.009 ms): fchmodat(dfd: CWD, filename: x, mode: IWGRP) = 0 10 -> 0.259 ( 0.008 ms): fchmodat(dfd: CWD, filename: x, mode: IXGRP) = 0 7 -> 0.249 ( 0.009 ms): fchmodat(dfd: CWD, filename: x, mode: IRWXO) = 0 4 -> 0.278 ( 0.011 
ms): fchmodat(dfd: CWD, filename: x, mode: IROTH) = 0 2 -> 0.276 ( 0.009 ms): fchmodat(dfd: CWD, filename: x, mode: IWOTH) = 0 1 -> 0.250 ( 0.008 ms): fchmodat(dfd: CWD, filename: x, mode: IXOTH) = 0 # # trace --no-inherit -e chmod,fchmodat,fchmod chmod 7777 x 0.258 ( 0.011 ms): fchmodat(dfd: CWD, filename: x, mode: IALLUGO) = 0 # trace --no-inherit -e chmod,fchmodat,fchmod chmod 7770 x 0.258 ( 0.008 ms): fchmodat(dfd: CWD, filename: x, mode: ISUID|ISGID|ISVTX|IRWXU|IRWXG) = 0 # trace --no-inherit -e chmod,fchmodat,fchmod chmod 777 x 0.293 ( 0.012 ms): fchmodat(dfd: CWD, filename: x, mode: IRWXUGO) = 0 # Now let's check by using the tracepoint for that specific syscall, instead of raw_syscalls:sys_enter as 'trace' does for its strace fu: # trace --no-inherit --ev syscalls:sys_enter_fchmodat -e fchmodat chmod 666 x 0.255 ( ): syscalls:sys_enter_fchmodat:dfd: 0xffffffffffffff9c, filename: 0x55db32a3f0f0, mode: 0x000001b6) 0.268 ( 0.012 ms): fchmodat(dfd: CWD, filename: x, mode: IRUGO|IWUGO ) = 0 # Perfect, 0x1b6 == 0666. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-18e8zfgbkj83xo87yoom43kd@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 3 ++ tools/perf/trace/beauty/mode_t.c | 68 ++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 tools/perf/trace/beauty/mode_t.c diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 57d4bb473add3..8440e2b92c6c4 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1126,6 +1126,7 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, .arg_parm = { [arg] = &strarray__##array, } #include "trace/beauty/pid.c" +#include "trace/beauty/mode_t.c" #include "trace/beauty/sched_policy.c" #include "trace/beauty/waitid_options.c" @@ -1767,6 +1768,8 @@ static int syscall__set_arg_fmts(struct syscall *sc) sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex; else if (strcmp(field->type, "pid_t") == 0) sc->arg_scnprintf[idx] = SCA_PID; + else if (strcmp(field->type, "umode_t") == 0) + sc->arg_scnprintf[idx] = SCA_MODE_T; ++idx; } diff --git a/tools/perf/trace/beauty/mode_t.c b/tools/perf/trace/beauty/mode_t.c new file mode 100644 index 0000000000000..930d8fef24008 --- /dev/null +++ b/tools/perf/trace/beauty/mode_t.c @@ -0,0 +1,68 @@ +#include +#include +#include + +/* From include/linux/stat.h */ +#ifndef S_IRWXUGO +#define S_IRWXUGO (S_IRWXU|S_IRWXG|S_IRWXO) +#endif +#ifndef S_IALLUGO +#define S_IALLUGO (S_ISUID|S_ISGID|S_ISVTX|S_IRWXUGO) +#endif +#ifndef S_IRUGO +#define S_IRUGO (S_IRUSR|S_IRGRP|S_IROTH) +#endif +#ifndef S_IWUGO +#define S_IWUGO (S_IWUSR|S_IWGRP|S_IWOTH) +#endif +#ifndef S_IXUGO +#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH) +#endif + +static size_t syscall_arg__scnprintf_mode_t(char *bf, size_t size, struct syscall_arg *arg) +{ + int printed = 0, mode = arg->val; + +#define P_MODE(n) \ + if ((mode & S_##n) == S_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ?
"|" : "", #n); \ + mode &= ~S_##n; \ + } + + P_MODE(IALLUGO); + P_MODE(IRWXUGO); + P_MODE(IRUGO); + P_MODE(IWUGO); + P_MODE(IXUGO); + P_MODE(IFMT); + P_MODE(IFSOCK); + P_MODE(IFLNK); + P_MODE(IFREG); + P_MODE(IFBLK); + P_MODE(IFDIR); + P_MODE(IFCHR); + P_MODE(IFIFO); + P_MODE(ISUID); + P_MODE(ISGID); + P_MODE(ISVTX); + P_MODE(IRWXU); + P_MODE(IRUSR); + P_MODE(IWUSR); + P_MODE(IXUSR); + P_MODE(IRWXG); + P_MODE(IRGRP); + P_MODE(IWGRP); + P_MODE(IXGRP); + P_MODE(IRWXO); + P_MODE(IROTH); + P_MODE(IWOTH); + P_MODE(IXOTH); +#undef P_MODE + + if (mode) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", mode); + + return printed; +} + +#define SCA_MODE_T syscall_arg__scnprintf_mode_t -- GitLab From fd0db10268b3729eb466fd726a39ce7d800bb150 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 4 Apr 2016 13:32:20 -0300 Subject: [PATCH 140/705] perf trace: Move syscall table id <-> name routines to separate class We're using libaudit for doing name to id and id to syscall name translations, but that makes 'perf trace' to have to wait for newer libaudit versions supporting recently added syscalls, such as "userfaultfd" at the time of this changeset. We have all the information right there, in the kernel sources, so move this code to a separate place, wrapped behind functions that will progressively use the kernel source files to extract the syscall table for use in 'perf trace'. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-i38opd09ow25mmyrvfwnbvkj@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 24 +++++++++----------- tools/perf/util/Build | 1 + tools/perf/util/syscalltbl.c | 43 ++++++++++++++++++++++++++++++++++++ tools/perf/util/syscalltbl.h | 16 ++++++++++++++ 4 files changed, 71 insertions(+), 13 deletions(-) create mode 100644 tools/perf/util/syscalltbl.c create mode 100644 tools/perf/util/syscalltbl.h diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 8440e2b92c6c4..11290b57ce049 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -34,8 +34,9 @@ #include "trace-event.h" #include "util/parse-events.h" #include "util/bpf-loader.h" +#include "syscalltbl.h" -#include +#include /* FIXME: Still needed for audit_errno_to_name */ #include #include #include @@ -114,10 +115,7 @@ struct trace { struct perf_tool tool; - struct { - int machine; - int open_id; - } audit; + struct syscalltbl *sctbl; struct { int max; struct syscall *table; @@ -163,6 +161,7 @@ struct trace { bool force; bool vfs_getname; int trace_pgfaults; + int open_id; }; struct tp_field { @@ -1780,7 +1779,7 @@ static int trace__read_syscall_info(struct trace *trace, int id) { char tp_name[128]; struct syscall *sc; - const char *name = audit_syscall_to_name(id, trace->audit.machine); + const char *name = syscalltbl__name(trace->sctbl, id); if (name == NULL) return -1; @@ -1855,7 +1854,7 @@ static int trace__validate_ev_qualifier(struct trace *trace) strlist__for_each(pos, trace->ev_qualifier) { const char *sc = pos->s; - int id = audit_name_to_syscall(sc, trace->audit.machine); + int id = syscalltbl__id(trace->sctbl, sc); if (id < 0) { if (err == 0) { @@ -2137,7 +2136,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, ret = perf_evsel__sc_tp_uint(evsel, ret, sample); - if (id == trace->audit.open_id && ret >= 0 && ttrace->filename.pending_open) { + if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) 
{ trace__set_fd_pathname(thread, ret, ttrace->filename.name); ttrace->filename.pending_open = false; ++trace->stats.vfs_getname; @@ -3189,10 +3188,6 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) NULL }; struct trace trace = { - .audit = { - .machine = audit_detect_machine(), - .open_id = audit_name_to_syscall("open", trace.audit.machine), - }, .syscalls = { . max = -1, }, @@ -3267,8 +3262,9 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) signal(SIGFPE, sighandler_dump_stack); trace.evlist = perf_evlist__new(); + trace.sctbl = syscalltbl__new(); - if (trace.evlist == NULL) { + if (trace.evlist == NULL || trace.sctbl == NULL) { pr_err("Not enough memory to run!\n"); err = -ENOMEM; goto out; @@ -3306,6 +3302,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) } } + trace.open_id = syscalltbl__id(trace.sctbl, "open"); + if (ev_qualifier_str != NULL) { const char *s = ev_qualifier_str; struct strlist_config slist_config = { diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 85ceff357769b..3443646d8da39 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -38,6 +38,7 @@ libperf-y += machine.o libperf-y += map.o libperf-y += pstack.o libperf-y += session.o +libperf-$(CONFIG_AUDIT) += syscalltbl.o libperf-y += ordered-events.o libperf-y += comm.o libperf-y += thread.o diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c new file mode 100644 index 0000000000000..1f13e57412eb5 --- /dev/null +++ b/tools/perf/util/syscalltbl.c @@ -0,0 +1,43 @@ +/* + * System call table mapper + * + * (C) 2016 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#include "syscalltbl.h" +#include +#include + + +struct syscalltbl *syscalltbl__new(void) +{ + struct syscalltbl *tbl = malloc(sizeof(*tbl)); + if (tbl) { + tbl->audit_machine = audit_detect_machine(); + } + return tbl; +} + +void syscalltbl__delete(struct syscalltbl *tbl) +{ + free(tbl); +} + +const char *syscalltbl__name(const struct syscalltbl *tbl, int id) +{ + return audit_syscall_to_name(id, tbl->audit_machine); +} + +int syscalltbl__id(struct syscalltbl *tbl, const char *name) +{ + return audit_name_to_syscall(name, tbl->audit_machine); +} diff --git a/tools/perf/util/syscalltbl.h b/tools/perf/util/syscalltbl.h new file mode 100644 index 0000000000000..9dee73c2e082d --- /dev/null +++ b/tools/perf/util/syscalltbl.h @@ -0,0 +1,16 @@ +#ifndef __PERF_SYSCALLTBL_H +#define __PERF_SYSCALLTBL_H + +struct syscalltbl { + union { + int audit_machine; + }; +}; + +struct syscalltbl *syscalltbl__new(void); +void syscalltbl__delete(struct syscalltbl *tbl); + +const char *syscalltbl__name(const struct syscalltbl *tbl, int id); +int syscalltbl__id(struct syscalltbl *tbl, const char *name); + +#endif /* __PERF_SYSCALLTBL_H */ -- GitLab From 5af56fab2b11769e35ce96613d321bcc0f7b84c1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 4 Apr 2016 17:52:18 -0300 Subject: [PATCH 141/705] perf tools: Allow generating per-arch syscall table arrays Tools should use a mechanism similar to arch/x86/entry/syscalls/ to generate a header file with the definitions for two variables: static const char *syscalltbl_x86_64[] = { [0] = "read", [1] = "write", [324] = "membarrier", [325] = "mlock2", [326] = "copy_file_range", }; static const int syscalltbl_x86_64_max_id = 326; In a per arch file that should then be included in tools/perf/util/syscalltbl.c. First one will be for x86_64. 
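A standalone sketch of the lookup this enables (it mirrors the strategy, not the actual perf code; the entries are the ones from the example above):

	#include <stdio.h>

	static const char *syscalltbl_x86_64[] = {
		[0] = "read",
		[1] = "write",
		[326] = "copy_file_range",
	};
	static const int syscalltbl_x86_64_max_id = 326;

	static const char *syscalltbl__name(int id)
	{
		/* holes left by the designated initializers are NULL */
		return id <= syscalltbl_x86_64_max_id ? syscalltbl_x86_64[id] : NULL;
	}

	int main(void)
	{
		printf("%s\n", syscalltbl__name(1));	/* write */
		return 0;
	}

The reverse direction, name to id, cannot be a direct index, which is why the implementation keeps a second copy of the entries sorted by name and bsearch()es it.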
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-02uuamkxgccczdth8komspgp@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/syscalltbl.c | 89 +++++++++++++++++++++++++++++++++++- tools/perf/util/syscalltbl.h | 4 ++ 2 files changed, 91 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c index 1f13e57412eb5..eb74a97b1f11a 100644 --- a/tools/perf/util/syscalltbl.c +++ b/tools/perf/util/syscalltbl.c @@ -14,19 +14,103 @@ */ #include "syscalltbl.h" +#include + +#ifdef HAVE_SYSCALL_TABLE +#include #include -#include +#include "util.h" + +struct syscall { + int id; + const char *name; +}; +static int syscallcmpname(const void *vkey, const void *ventry) +{ + const char *key = vkey; + const struct syscall *entry = ventry; + + return strcmp(key, entry->name); +} + +static int syscallcmp(const void *va, const void *vb) +{ + const struct syscall *a = va, *b = vb; + + return strcmp(a->name, b->name); +} + +static int syscalltbl__init_native(struct syscalltbl *tbl) +{ + int nr_entries = 0, i, j; + struct syscall *entries; + + for (i = 0; i <= syscalltbl_native_max_id; ++i) + if (syscalltbl_native[i]) + ++nr_entries; + + entries = tbl->syscalls.entries = malloc(sizeof(struct syscall) * nr_entries); + if (tbl->syscalls.entries == NULL) + return -1; + + for (i = 0, j = 0; i <= syscalltbl_native_max_id; ++i) { + if (syscalltbl_native[i]) { + entries[j].name = syscalltbl_native[i]; + entries[j].id = i; + ++j; + } + } + + qsort(tbl->syscalls.entries, nr_entries, sizeof(struct syscall), syscallcmp); + tbl->syscalls.nr_entries = nr_entries; + return 0; +} struct syscalltbl *syscalltbl__new(void) { struct syscalltbl *tbl = malloc(sizeof(*tbl)); if (tbl) { - tbl->audit_machine = audit_detect_machine(); + if (syscalltbl__init_native(tbl)) { + free(tbl); + return NULL; + } } return tbl; } +void syscalltbl__delete(struct syscalltbl *tbl) +{ + zfree(&tbl->syscalls.entries); + free(tbl); +} + +const char *syscalltbl__name(const struct syscalltbl *tbl __maybe_unused, int id) +{ + return id <= syscalltbl_native_max_id ? syscalltbl_native[id]: NULL; +} + +int syscalltbl__id(struct syscalltbl *tbl, const char *name) +{ + struct syscall *sc = bsearch(name, tbl->syscalls.entries, + tbl->syscalls.nr_entries, sizeof(*sc), + syscallcmpname); + + return sc ? 
sc->id : -1; +} + +#else /* HAVE_SYSCALL_TABLE */ + +#include + +struct syscalltbl *syscalltbl__new(void) +{ + struct syscalltbl *tbl = malloc(sizeof(*tbl)); + if (tbl) + tbl->audit_machine = audit_detect_machine(); + return tbl; +} + void syscalltbl__delete(struct syscalltbl *tbl) { free(tbl); @@ -41,3 +125,4 @@ int syscalltbl__id(struct syscalltbl *tbl, const char *name) { return audit_name_to_syscall(name, tbl->audit_machine); } +#endif /* HAVE_SYSCALL_TABLE */ diff --git a/tools/perf/util/syscalltbl.h b/tools/perf/util/syscalltbl.h index 9dee73c2e082d..e2951510484f8 100644 --- a/tools/perf/util/syscalltbl.h +++ b/tools/perf/util/syscalltbl.h @@ -4,6 +4,10 @@ struct syscalltbl { union { int audit_machine; + struct { + int nr_entries; + void *entries; + } syscalls; }; }; -- GitLab From 1b700c9975008615ad470cf79acc8455ce60a695 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 4 Apr 2016 19:05:36 -0300 Subject: [PATCH 142/705] perf tools: Build syscall table .c header from kernel's syscall_64.tbl We used libaudit to map ids to syscall names and vice-versa, but that imposes a delay in supporting new syscalls, having to wait for libaudit to get those new syscalls on its tables. To remove that delay, for x86_64 initially, grab a copy of arch/x86/entry/syscalls/syscall_64.tbl and use it to generate those tables. Syscalls currently not available in audit-libs: # trace -e copy_file_range,membarrier,mlock2,pread64,pwrite64,timerfd_create,userfaultfd Error: Invalid syscall copy_file_range, membarrier, mlock2, pread64, pwrite64, timerfd_create, userfaultfd Hint: try 'perf list syscalls:sys_enter_*' Hint: and: 'man syscalls' # With this patch: # trace -e copy_file_range,membarrier,mlock2,pread64,pwrite64,timerfd_create,userfaultfd 8505.733 ( 0.010 ms): gnome-shell/2519 timerfd_create(flags: 524288) = 36 8506.688 ( 0.005 ms): gnome-shell/2519 timerfd_create(flags: 524288) = 40 30023.097 ( 0.025 ms): qemu-system-x8/24629 pwrite64(fd: 18, buf: 0x7f63ae382000, count: 4096, pos: 529592320) = 4096 31268.712 ( 0.028 ms): qemu-system-x8/24629 pwrite64(fd: 18, buf: 0x7f63afd8b000, count: 4096, pos: 2314133504) = 4096 31268.854 ( 0.016 ms): qemu-system-x8/24629 pwrite64(fd: 18, buf: 0x7f63afda2000, count: 4096, pos: 2314137600) = 4096 Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-51xfjbxevdsucmnbc4ka5r88@git.kernel.org [ Added make dep for 'prepare' in 'LIBPERF_IN', fix by Wang Nan to fix parallel build ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 13 +- tools/perf/arch/x86/Makefile | 23 ++ .../arch/x86/entry/syscalls/syscall_64.tbl | 374 ++++++++++++++++++ .../arch/x86/entry/syscalls/syscalltbl.sh | 39 ++ tools/perf/config/Makefile | 2 +- tools/perf/util/Build | 4 + tools/perf/util/syscalltbl.c | 6 + 7 files changed, 456 insertions(+), 5 deletions(-) create mode 100644 tools/perf/arch/x86/entry/syscalls/syscall_64.tbl create mode 100755 tools/perf/arch/x86/entry/syscalls/syscalltbl.sh diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 58aed81a21ea9..bde8cbae7dd98 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -183,6 +183,11 @@ endif include config/Makefile endif +ifeq ($(config),0) +include $(srctree)/tools/scripts/Makefile.arch +-include arch/$(ARCH)/Makefile +endif + # The FEATURE_DUMP_EXPORT holds location of the actual # FEATURE_DUMP file to be used to bypass feature detection # (for bpf or any other subproject) @@ -388,7 +393,7 @@ endif __build-dir
= $(subst $(OUTPUT),,$(dir $@)) build-dir = $(if $(__build-dir),$(__build-dir),.) -prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h fixdep +prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h fixdep archheaders $(OUTPUT)%.o: %.c prepare FORCE $(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@ @@ -428,7 +433,7 @@ $(patsubst perf-%,%.o,$(PROGRAMS)): $(wildcard */*.h) LIBPERF_IN := $(OUTPUT)libperf-in.o -$(LIBPERF_IN): fixdep FORCE +$(LIBPERF_IN): prepare fixdep FORCE $(Q)$(MAKE) $(build)=libperf $(LIB_FILE): $(LIBPERF_IN) @@ -623,7 +628,7 @@ config-clean: $(call QUIET_CLEAN, config) $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ $(if $(OUTPUT),OUTPUT=$(OUTPUT)feature/,) clean >/dev/null -clean: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean +clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean $(call QUIET_CLEAN, core-objs) $(RM) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS) $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete $(Q)$(RM) $(OUTPUT).config-detected @@ -660,5 +665,5 @@ FORCE: .PHONY: all install clean config-clean strip install-gtk .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell .PHONY: $(GIT-HEAD-PHONY) TAGS tags cscope FORCE prepare -.PHONY: libtraceevent_plugins +.PHONY: libtraceevent_plugins archheaders diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile index 269af21437353..a33729173b134 100644 --- a/tools/perf/arch/x86/Makefile +++ b/tools/perf/arch/x86/Makefile @@ -4,3 +4,26 @@ endif HAVE_KVM_STAT_SUPPORT := 1 PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1 PERF_HAVE_JITDUMP := 1 + +### +# Syscall table generation +# + +out := $(OUTPUT)arch/x86/include/generated/asm +header := $(out)/syscalls_64.c +sys := $(srctree)/tools/perf/arch/x86/entry/syscalls +systbl := $(sys)/syscalltbl.sh + +# Create output directory if not already present +_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') + +$(header): $(sys)/syscall_64.tbl $(systbl) + @(test -d ../../kernel -a -d ../../tools -a -d ../perf && ( \ + (diff -B arch/x86/entry/syscalls/syscall_64.tbl ../../arch/x86/entry/syscalls/syscall_64.tbl >/dev/null) \ + || echo "Warning: x86_64's syscall_64.tbl differs from kernel" >&2 )) || true + $(Q)$(SHELL) '$(systbl)' $(sys)/syscall_64.tbl 'x86_64' > $@ + +clean:: + rm -f $(header) + +archheaders: $(header) diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl new file mode 100644 index 0000000000000..2e5b565adacc5 --- /dev/null +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -0,0 +1,374 @@ +# +# 64-bit system call numbers and entry vectors +# +# The format is: +# +# +# The abi is "common", "64" or "x32" for this file. 
+# +0 common read sys_read +1 common write sys_write +2 common open sys_open +3 common close sys_close +4 common stat sys_newstat +5 common fstat sys_newfstat +6 common lstat sys_newlstat +7 common poll sys_poll +8 common lseek sys_lseek +9 common mmap sys_mmap +10 common mprotect sys_mprotect +11 common munmap sys_munmap +12 common brk sys_brk +13 64 rt_sigaction sys_rt_sigaction +14 common rt_sigprocmask sys_rt_sigprocmask +15 64 rt_sigreturn sys_rt_sigreturn/ptregs +16 64 ioctl sys_ioctl +17 common pread64 sys_pread64 +18 common pwrite64 sys_pwrite64 +19 64 readv sys_readv +20 64 writev sys_writev +21 common access sys_access +22 common pipe sys_pipe +23 common select sys_select +24 common sched_yield sys_sched_yield +25 common mremap sys_mremap +26 common msync sys_msync +27 common mincore sys_mincore +28 common madvise sys_madvise +29 common shmget sys_shmget +30 common shmat sys_shmat +31 common shmctl sys_shmctl +32 common dup sys_dup +33 common dup2 sys_dup2 +34 common pause sys_pause +35 common nanosleep sys_nanosleep +36 common getitimer sys_getitimer +37 common alarm sys_alarm +38 common setitimer sys_setitimer +39 common getpid sys_getpid +40 common sendfile sys_sendfile64 +41 common socket sys_socket +42 common connect sys_connect +43 common accept sys_accept +44 common sendto sys_sendto +45 64 recvfrom sys_recvfrom +46 64 sendmsg sys_sendmsg +47 64 recvmsg sys_recvmsg +48 common shutdown sys_shutdown +49 common bind sys_bind +50 common listen sys_listen +51 common getsockname sys_getsockname +52 common getpeername sys_getpeername +53 common socketpair sys_socketpair +54 64 setsockopt sys_setsockopt +55 64 getsockopt sys_getsockopt +56 common clone sys_clone/ptregs +57 common fork sys_fork/ptregs +58 common vfork sys_vfork/ptregs +59 64 execve sys_execve/ptregs +60 common exit sys_exit +61 common wait4 sys_wait4 +62 common kill sys_kill +63 common uname sys_newuname +64 common semget sys_semget +65 common semop sys_semop +66 common semctl sys_semctl +67 common shmdt sys_shmdt +68 common msgget sys_msgget +69 common msgsnd sys_msgsnd +70 common msgrcv sys_msgrcv +71 common msgctl sys_msgctl +72 common fcntl sys_fcntl +73 common flock sys_flock +74 common fsync sys_fsync +75 common fdatasync sys_fdatasync +76 common truncate sys_truncate +77 common ftruncate sys_ftruncate +78 common getdents sys_getdents +79 common getcwd sys_getcwd +80 common chdir sys_chdir +81 common fchdir sys_fchdir +82 common rename sys_rename +83 common mkdir sys_mkdir +84 common rmdir sys_rmdir +85 common creat sys_creat +86 common link sys_link +87 common unlink sys_unlink +88 common symlink sys_symlink +89 common readlink sys_readlink +90 common chmod sys_chmod +91 common fchmod sys_fchmod +92 common chown sys_chown +93 common fchown sys_fchown +94 common lchown sys_lchown +95 common umask sys_umask +96 common gettimeofday sys_gettimeofday +97 common getrlimit sys_getrlimit +98 common getrusage sys_getrusage +99 common sysinfo sys_sysinfo +100 common times sys_times +101 64 ptrace sys_ptrace +102 common getuid sys_getuid +103 common syslog sys_syslog +104 common getgid sys_getgid +105 common setuid sys_setuid +106 common setgid sys_setgid +107 common geteuid sys_geteuid +108 common getegid sys_getegid +109 common setpgid sys_setpgid +110 common getppid sys_getppid +111 common getpgrp sys_getpgrp +112 common setsid sys_setsid +113 common setreuid sys_setreuid +114 common setregid sys_setregid +115 common getgroups sys_getgroups +116 common setgroups sys_setgroups +117 common setresuid sys_setresuid +118 
common getresuid sys_getresuid +119 common setresgid sys_setresgid +120 common getresgid sys_getresgid +121 common getpgid sys_getpgid +122 common setfsuid sys_setfsuid +123 common setfsgid sys_setfsgid +124 common getsid sys_getsid +125 common capget sys_capget +126 common capset sys_capset +127 64 rt_sigpending sys_rt_sigpending +128 64 rt_sigtimedwait sys_rt_sigtimedwait +129 64 rt_sigqueueinfo sys_rt_sigqueueinfo +130 common rt_sigsuspend sys_rt_sigsuspend +131 64 sigaltstack sys_sigaltstack +132 common utime sys_utime +133 common mknod sys_mknod +134 64 uselib +135 common personality sys_personality +136 common ustat sys_ustat +137 common statfs sys_statfs +138 common fstatfs sys_fstatfs +139 common sysfs sys_sysfs +140 common getpriority sys_getpriority +141 common setpriority sys_setpriority +142 common sched_setparam sys_sched_setparam +143 common sched_getparam sys_sched_getparam +144 common sched_setscheduler sys_sched_setscheduler +145 common sched_getscheduler sys_sched_getscheduler +146 common sched_get_priority_max sys_sched_get_priority_max +147 common sched_get_priority_min sys_sched_get_priority_min +148 common sched_rr_get_interval sys_sched_rr_get_interval +149 common mlock sys_mlock +150 common munlock sys_munlock +151 common mlockall sys_mlockall +152 common munlockall sys_munlockall +153 common vhangup sys_vhangup +154 common modify_ldt sys_modify_ldt +155 common pivot_root sys_pivot_root +156 64 _sysctl sys_sysctl +157 common prctl sys_prctl +158 common arch_prctl sys_arch_prctl +159 common adjtimex sys_adjtimex +160 common setrlimit sys_setrlimit +161 common chroot sys_chroot +162 common sync sys_sync +163 common acct sys_acct +164 common settimeofday sys_settimeofday +165 common mount sys_mount +166 common umount2 sys_umount +167 common swapon sys_swapon +168 common swapoff sys_swapoff +169 common reboot sys_reboot +170 common sethostname sys_sethostname +171 common setdomainname sys_setdomainname +172 common iopl sys_iopl/ptregs +173 common ioperm sys_ioperm +174 64 create_module +175 common init_module sys_init_module +176 common delete_module sys_delete_module +177 64 get_kernel_syms +178 64 query_module +179 common quotactl sys_quotactl +180 64 nfsservctl +181 common getpmsg +182 common putpmsg +183 common afs_syscall +184 common tuxcall +185 common security +186 common gettid sys_gettid +187 common readahead sys_readahead +188 common setxattr sys_setxattr +189 common lsetxattr sys_lsetxattr +190 common fsetxattr sys_fsetxattr +191 common getxattr sys_getxattr +192 common lgetxattr sys_lgetxattr +193 common fgetxattr sys_fgetxattr +194 common listxattr sys_listxattr +195 common llistxattr sys_llistxattr +196 common flistxattr sys_flistxattr +197 common removexattr sys_removexattr +198 common lremovexattr sys_lremovexattr +199 common fremovexattr sys_fremovexattr +200 common tkill sys_tkill +201 common time sys_time +202 common futex sys_futex +203 common sched_setaffinity sys_sched_setaffinity +204 common sched_getaffinity sys_sched_getaffinity +205 64 set_thread_area +206 64 io_setup sys_io_setup +207 common io_destroy sys_io_destroy +208 common io_getevents sys_io_getevents +209 64 io_submit sys_io_submit +210 common io_cancel sys_io_cancel +211 64 get_thread_area +212 common lookup_dcookie sys_lookup_dcookie +213 common epoll_create sys_epoll_create +214 64 epoll_ctl_old +215 64 epoll_wait_old +216 common remap_file_pages sys_remap_file_pages +217 common getdents64 sys_getdents64 +218 common set_tid_address sys_set_tid_address +219 common restart_syscall 
sys_restart_syscall +220 common semtimedop sys_semtimedop +221 common fadvise64 sys_fadvise64 +222 64 timer_create sys_timer_create +223 common timer_settime sys_timer_settime +224 common timer_gettime sys_timer_gettime +225 common timer_getoverrun sys_timer_getoverrun +226 common timer_delete sys_timer_delete +227 common clock_settime sys_clock_settime +228 common clock_gettime sys_clock_gettime +229 common clock_getres sys_clock_getres +230 common clock_nanosleep sys_clock_nanosleep +231 common exit_group sys_exit_group +232 common epoll_wait sys_epoll_wait +233 common epoll_ctl sys_epoll_ctl +234 common tgkill sys_tgkill +235 common utimes sys_utimes +236 64 vserver +237 common mbind sys_mbind +238 common set_mempolicy sys_set_mempolicy +239 common get_mempolicy sys_get_mempolicy +240 common mq_open sys_mq_open +241 common mq_unlink sys_mq_unlink +242 common mq_timedsend sys_mq_timedsend +243 common mq_timedreceive sys_mq_timedreceive +244 64 mq_notify sys_mq_notify +245 common mq_getsetattr sys_mq_getsetattr +246 64 kexec_load sys_kexec_load +247 64 waitid sys_waitid +248 common add_key sys_add_key +249 common request_key sys_request_key +250 common keyctl sys_keyctl +251 common ioprio_set sys_ioprio_set +252 common ioprio_get sys_ioprio_get +253 common inotify_init sys_inotify_init +254 common inotify_add_watch sys_inotify_add_watch +255 common inotify_rm_watch sys_inotify_rm_watch +256 common migrate_pages sys_migrate_pages +257 common openat sys_openat +258 common mkdirat sys_mkdirat +259 common mknodat sys_mknodat +260 common fchownat sys_fchownat +261 common futimesat sys_futimesat +262 common newfstatat sys_newfstatat +263 common unlinkat sys_unlinkat +264 common renameat sys_renameat +265 common linkat sys_linkat +266 common symlinkat sys_symlinkat +267 common readlinkat sys_readlinkat +268 common fchmodat sys_fchmodat +269 common faccessat sys_faccessat +270 common pselect6 sys_pselect6 +271 common ppoll sys_ppoll +272 common unshare sys_unshare +273 64 set_robust_list sys_set_robust_list +274 64 get_robust_list sys_get_robust_list +275 common splice sys_splice +276 common tee sys_tee +277 common sync_file_range sys_sync_file_range +278 64 vmsplice sys_vmsplice +279 64 move_pages sys_move_pages +280 common utimensat sys_utimensat +281 common epoll_pwait sys_epoll_pwait +282 common signalfd sys_signalfd +283 common timerfd_create sys_timerfd_create +284 common eventfd sys_eventfd +285 common fallocate sys_fallocate +286 common timerfd_settime sys_timerfd_settime +287 common timerfd_gettime sys_timerfd_gettime +288 common accept4 sys_accept4 +289 common signalfd4 sys_signalfd4 +290 common eventfd2 sys_eventfd2 +291 common epoll_create1 sys_epoll_create1 +292 common dup3 sys_dup3 +293 common pipe2 sys_pipe2 +294 common inotify_init1 sys_inotify_init1 +295 64 preadv sys_preadv +296 64 pwritev sys_pwritev +297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo +298 common perf_event_open sys_perf_event_open +299 64 recvmmsg sys_recvmmsg +300 common fanotify_init sys_fanotify_init +301 common fanotify_mark sys_fanotify_mark +302 common prlimit64 sys_prlimit64 +303 common name_to_handle_at sys_name_to_handle_at +304 common open_by_handle_at sys_open_by_handle_at +305 common clock_adjtime sys_clock_adjtime +306 common syncfs sys_syncfs +307 64 sendmmsg sys_sendmmsg +308 common setns sys_setns +309 common getcpu sys_getcpu +310 64 process_vm_readv sys_process_vm_readv +311 64 process_vm_writev sys_process_vm_writev +312 common kcmp sys_kcmp +313 common finit_module sys_finit_module +314 common 
sched_setattr sys_sched_setattr +315 common sched_getattr sys_sched_getattr +316 common renameat2 sys_renameat2 +317 common seccomp sys_seccomp +318 common getrandom sys_getrandom +319 common memfd_create sys_memfd_create +320 common kexec_file_load sys_kexec_file_load +321 common bpf sys_bpf +322 64 execveat sys_execveat/ptregs +323 common userfaultfd sys_userfaultfd +324 common membarrier sys_membarrier +325 common mlock2 sys_mlock2 +326 common copy_file_range sys_copy_file_range + +# +# x32-specific system call numbers start at 512 to avoid cache impact +# for native 64-bit operation. +# +512 x32 rt_sigaction compat_sys_rt_sigaction +513 x32 rt_sigreturn sys32_x32_rt_sigreturn +514 x32 ioctl compat_sys_ioctl +515 x32 readv compat_sys_readv +516 x32 writev compat_sys_writev +517 x32 recvfrom compat_sys_recvfrom +518 x32 sendmsg compat_sys_sendmsg +519 x32 recvmsg compat_sys_recvmsg +520 x32 execve compat_sys_execve/ptregs +521 x32 ptrace compat_sys_ptrace +522 x32 rt_sigpending compat_sys_rt_sigpending +523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait +524 x32 rt_sigqueueinfo compat_sys_rt_sigqueueinfo +525 x32 sigaltstack compat_sys_sigaltstack +526 x32 timer_create compat_sys_timer_create +527 x32 mq_notify compat_sys_mq_notify +528 x32 kexec_load compat_sys_kexec_load +529 x32 waitid compat_sys_waitid +530 x32 set_robust_list compat_sys_set_robust_list +531 x32 get_robust_list compat_sys_get_robust_list +532 x32 vmsplice compat_sys_vmsplice +533 x32 move_pages compat_sys_move_pages +534 x32 preadv compat_sys_preadv64 +535 x32 pwritev compat_sys_pwritev64 +536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +537 x32 recvmmsg compat_sys_recvmmsg +538 x32 sendmmsg compat_sys_sendmmsg +539 x32 process_vm_readv compat_sys_process_vm_readv +540 x32 process_vm_writev compat_sys_process_vm_writev +541 x32 setsockopt compat_sys_setsockopt +542 x32 getsockopt compat_sys_getsockopt +543 x32 io_setup compat_sys_io_setup +544 x32 io_submit compat_sys_io_submit +545 x32 execveat compat_sys_execveat/ptregs diff --git a/tools/perf/arch/x86/entry/syscalls/syscalltbl.sh b/tools/perf/arch/x86/entry/syscalls/syscalltbl.sh new file mode 100755 index 0000000000000..49a18b9ad9cf3 --- /dev/null +++ b/tools/perf/arch/x86/entry/syscalls/syscalltbl.sh @@ -0,0 +1,39 @@ +#!/bin/sh + +in="$1" +arch="$2" + +syscall_macro() { + nr="$1" + name="$2" + + echo " [$nr] = \"$name\"," +} + +emit() { + nr="$1" + entry="$2" + + syscall_macro "$nr" "$entry" +} + +echo "static const char *syscalltbl_${arch}[] = {" + +sorted_table=$(mktemp /tmp/syscalltbl.XXXXXX) +grep '^[0-9]' "$in" | sort -n > $sorted_table + +max_nr=0 +while read nr abi name entry compat; do + if [ $nr -ge 512 ] ; then # discard compat sycalls + break + fi + + emit "$nr" "$name" + max_nr=$nr +done < $sorted_table + +rm -f $sorted_table + +echo "};" + +echo "#define SYSCALLTBL_${arch}_MAX_ID ${max_nr}" diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index d1e2b856ef0fd..1e46277286c2e 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -27,7 +27,7 @@ NO_PERF_REGS := 1 ifeq ($(ARCH),x86) $(call detected,CONFIG_X86) ifeq (${IS_64_BIT}, 1) - CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT + CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT -DHAVE_SYSCALL_TABLE -I$(OUTPUT)arch/x86/include/generated ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S ../../arch/x86/lib/memset_64.S LIBUNWIND_LIBS = -lunwind -lunwind-x86_64 $(call detected,CONFIG_X86_64) diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 3443646d8da39..ea4ac03c1ec81 
100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -148,6 +148,10 @@ CFLAGS_libstring.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ET CFLAGS_hweight.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_parse-events.o += -Wno-redundant-decls +$(OUTPUT)util/syscalltbl.o: util/syscalltbl.c arch/x86/entry/syscalls/syscall_64.tbl $(OUTPUT)arch/x86/include/generated/asm/syscalls_64.c FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_o_c) + $(OUTPUT)util/kallsyms.o: ../lib/symbol/kallsyms.c FORCE $(call rule_mkdir) $(call if_changed_dep,cc_o_c) diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c index eb74a97b1f11a..bbb4c19575785 100644 --- a/tools/perf/util/syscalltbl.c +++ b/tools/perf/util/syscalltbl.c @@ -21,6 +21,12 @@ #include #include "util.h" +#if defined(__x86_64__) +#include +const int syscalltbl_native_max_id = SYSCALLTBL_x86_64_MAX_ID; +static const char **syscalltbl_native = syscalltbl_x86_64; +#endif + struct syscall { int id; const char *name; -- GitLab From a58f7033ba892b7d299954b94471450d72623039 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 7 Apr 2016 10:24:30 +0000 Subject: [PATCH 143/705] perf symbols: Record text offset in dso to calculate objdump address In this patch, the offset of the '.text' section is stored in the dso and used to re-calculate the address passed to objdump. In most cases, executable code is in the '.text' section, so the adjustment made to a symbol in dso__load_sym (using sym.st_value -= shdr.sh_addr - shdr.sh_offset) should be equal to 'sym.st_value -= dso->text_offset'. Therefore, adding text_offset back gets the objdump address from the symbol address (rip). However, this is not true for the kernel and kernel modules, since there can be multiple executable sections with different offsets. Exclude the kernel for this reason. After this patch, even when dso->adjust_symbols is set to true for shared objects, map__rip_2objdump() and map__objdump_2mem() return correct results, so the annotate behavior of perf won't change. Signed-off-by: Wang Nan Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Cody P Schafer Cc: He Kuang Cc: Jiri Olsa Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1460024671-64774-2-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/map.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 171b6d10a04b6..02c31865648b1 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -431,6 +431,13 @@ u64 map__rip_2objdump(struct map *map, u64 rip) if (map->dso->rel) return rip - map->pgoff; + /* + * kernel modules also have DSO_TYPE_USER in dso->kernel, + * but all kernel modules are ET_REL, so won't get here. + */ + if (map->dso->kernel == DSO_TYPE_USER) + return rip + map->dso->text_offset; + return map->unmap_ip(map, rip) - map->reloc; } @@ -454,6 +461,13 @@ u64 map__objdump_2mem(struct map *map, u64 ip) if (map->dso->rel) return map->unmap_ip(map, ip + map->pgoff); + /* + * kernel modules also have DSO_TYPE_USER in dso->kernel, + * but all kernel modules are ET_REL, so won't get here.
+ */ + if (map->dso->kernel == DSO_TYPE_USER) + return map->unmap_ip(map, ip - map->dso->text_offset); + return ip + map->reloc; } -- GitLab From 99e87f7bb7268cf644add87130590966fd5d0d17 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 7 Apr 2016 10:24:31 +0000 Subject: [PATCH 144/705] perf symbols: Adjust symbol for shared objects He Kuang reported in [1] a problem where perf fails to get the correct symbol on the Android platform. The problem can be reproduced on a normal x86_64 platform. I will describe the steps to reproduce it in detail at the end of this commit message. The reason for this problem is the missing symbol adjustment for normal shared objects. In most cases, skipping the adjustment is okay. However, when the '.text' section has different 'address' and 'offset' values, the result is wrong. I checked all shared objects on my working platform; only wine dll objects and debug objects (in .debug) have this problem. However, it is common on Android. For example: $ readelf -S ./libsurfaceflinger.so | grep \.text [10] .text PROGBITS 0000000000029030 00012030 This patch enables symbol adjustment for dynamic objects so the symbol address obtained from elfutils is adjusted correctly. Now nearly all types of ELF files should have their symbols adjusted, so make ss->adjust_symbols default to true. Steps to reproduce the problem: $ cat ./Makefile PWD := $(shell pwd) LDFLAGS += "-Wl,-rpath=$(PWD)" CFLAGS += -g main: main.c libbuggy.so libbuggy.so: buggy.c gcc -g -shared -fPIC -Wl,-Ttext-segment=0x200000 $< -o $@ clean: rm -rf main libbuggy.so *.o $ cat ./buggy.c int fib(int x) { return (x == 0) ? 1 : (x == 1) ? 1 : fib(x - 1) + fib(x - 2); } $ cat ./main.c #include extern int fib(int x); int main() { int i; for (i = 0; i < 40; i++) printf("%d\n", fib(i)); return 0; } $ make $ perf record ./main ... $ perf report --stdio # Overhead Command Shared Object Symbol # ........ ....... ................. ............................... # 14.97% main libbuggy.so [.] 0x000000000000066c 8.68% main libbuggy.so [.] 0x00000000000006aa 8.52% main libbuggy.so [.] fib@plt 7.95% main libbuggy.so [.] 0x0000000000000664 5.94% main libbuggy.so [.] 0x00000000000006a9 5.35% main libbuggy.so [.] 0x0000000000000678 ... The correct result should be (after this patch): # Overhead Command Shared Object Symbol # ........ ....... ................. ............................... # 91.47% main libbuggy.so [.] fib 8.52% main libbuggy.so [.]
fib@plt 0.00% main [kernel.kallsyms] [k] kmem_cache_free [1] http://lkml.kernel.org/g/1452567507-54013-1-git-send-email-hekuang@huawei.com Signed-off-by: Wang Nan Acked-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Cody P Schafer Cc: He Kuang Cc: Jiri Olsa Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1460024671-64774-3-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol-elf.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index bc229a74c6a9a..3f9d6798bd183 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -709,17 +709,10 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name, if (ss->opdshdr.sh_type != SHT_PROGBITS) ss->opdsec = NULL; - if (dso->kernel == DSO_TYPE_USER) { - GElf_Shdr shdr; - ss->adjust_symbols = (ehdr.e_type == ET_EXEC || - ehdr.e_type == ET_REL || - dso__is_vdso(dso) || - elf_section_by_name(elf, &ehdr, &shdr, - ".gnu.prelink_undo", - NULL) != NULL); - } else { + if (dso->kernel == DSO_TYPE_USER) + ss->adjust_symbols = true; + else ss->adjust_symbols = elf__needs_adjust_symbols(ehdr); - } ss->name = strdup(name); if (!ss->name) { -- GitLab From a5e8e825bd1704c488bf6a46936aaf3b9f203d6a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 8 Apr 2016 11:25:59 -0300 Subject: [PATCH 145/705] perf script: Use readdir() instead of deprecated readdir_r() The readdir() function is thread safe as long as just one thread uses a DIR, which is the case in 'perf script', so, to avoid breaking the build with glibc-2.23.90 (upcoming 2.24), use it instead of readdir_r(). See: http://man7.org/linux/man-pages/man3/readdir.3.html "However, in modern implementations (including the glibc implementation), concurrent calls to readdir() that specify different directory streams are thread-safe. In cases where multiple threads must read from the same directory stream, using readdir() with external synchronization is still preferable to the use of the deprecated readdir_r(3) function." Noticed while building on a Fedora Rawhide docker container. 
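For reference, the conversion applied in this and the following readdir patches boils down to the pattern below, a minimal sketch that is not taken from the patch itself (the function and path are illustrative):

	#include <dirent.h>
	#include <stdio.h>

	/* One DIR stream used by a single thread: plain readdir() is safe. */
	static int list_dir(const char *path)
	{
		DIR *dir = opendir(path);
		struct dirent *dent;

		if (dir == NULL)
			return -1;

		while ((dent = readdir(dir)) != NULL)
			printf("%s\n", dent->d_name);

		closedir(dir);
		return 0;
	}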
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-mt3xz7n2hl49ni2vx7kuq74g@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 70 ++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 59009aa7e2ca4..8f6ab2ac855ad 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1415,21 +1415,19 @@ static int is_directory(const char *base_path, const struct dirent *dent) return S_ISDIR(st.st_mode); } -#define for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next)\ - while (!readdir_r(scripts_dir, &lang_dirent, &lang_next) && \ - lang_next) \ - if ((lang_dirent.d_type == DT_DIR || \ - (lang_dirent.d_type == DT_UNKNOWN && \ - is_directory(scripts_path, &lang_dirent))) && \ - (strcmp(lang_dirent.d_name, ".")) && \ - (strcmp(lang_dirent.d_name, ".."))) - -#define for_each_script(lang_path, lang_dir, script_dirent, script_next)\ - while (!readdir_r(lang_dir, &script_dirent, &script_next) && \ - script_next) \ - if (script_dirent.d_type != DT_DIR && \ - (script_dirent.d_type != DT_UNKNOWN || \ - !is_directory(lang_path, &script_dirent))) +#define for_each_lang(scripts_path, scripts_dir, lang_dirent) \ + while ((lang_dirent = readdir(scripts_dir)) != NULL) \ + if ((lang_dirent->d_type == DT_DIR || \ + (lang_dirent->d_type == DT_UNKNOWN && \ + is_directory(scripts_path, lang_dirent))) && \ + (strcmp(lang_dirent->d_name, ".")) && \ + (strcmp(lang_dirent->d_name, ".."))) + +#define for_each_script(lang_path, lang_dir, script_dirent) \ + while ((script_dirent = readdir(lang_dir)) != NULL) \ + if (script_dirent->d_type != DT_DIR && \ + (script_dirent->d_type != DT_UNKNOWN || \ + !is_directory(lang_path, script_dirent))) #define RECORD_SUFFIX "-record" @@ -1575,7 +1573,7 @@ static int list_available_scripts(const struct option *opt __maybe_unused, const char *s __maybe_unused, int unset __maybe_unused) { - struct dirent *script_next, *lang_next, script_dirent, lang_dirent; + struct dirent *script_dirent, *lang_dirent; char scripts_path[MAXPATHLEN]; DIR *scripts_dir, *lang_dir; char script_path[MAXPATHLEN]; @@ -1590,19 +1588,19 @@ static int list_available_scripts(const struct option *opt __maybe_unused, if (!scripts_dir) return -1; - for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) { + for_each_lang(scripts_path, scripts_dir, lang_dirent) { snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path, - lang_dirent.d_name); + lang_dirent->d_name); lang_dir = opendir(lang_path); if (!lang_dir) continue; - for_each_script(lang_path, lang_dir, script_dirent, script_next) { - script_root = get_script_root(&script_dirent, REPORT_SUFFIX); + for_each_script(lang_path, lang_dir, script_dirent) { + script_root = get_script_root(script_dirent, REPORT_SUFFIX); if (script_root) { desc = script_desc__findnew(script_root); snprintf(script_path, MAXPATHLEN, "%s/%s", - lang_path, script_dirent.d_name); + lang_path, script_dirent->d_name); read_script_info(desc, script_path); free(script_root); } @@ -1690,7 +1688,7 @@ static int check_ev_match(char *dir_name, char *scriptname, */ int find_scripts(char **scripts_array, char **scripts_path_array) { - struct dirent *script_next, *lang_next, script_dirent, lang_dirent; + struct dirent *script_dirent, *lang_dirent; char scripts_path[MAXPATHLEN], lang_path[MAXPATHLEN]; DIR *scripts_dir, *lang_dir; struct perf_session *session; @@ 
-1713,9 +1711,9 @@ int find_scripts(char **scripts_array, char **scripts_path_array) return -1; } - for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) { + for_each_lang(scripts_path, scripts_dir, lang_dirent) { snprintf(lang_path, MAXPATHLEN, "%s/%s", scripts_path, - lang_dirent.d_name); + lang_dirent->d_name); #ifdef NO_LIBPERL if (strstr(lang_path, "perl")) continue; @@ -1729,16 +1727,16 @@ int find_scripts(char **scripts_array, char **scripts_path_array) if (!lang_dir) continue; - for_each_script(lang_path, lang_dir, script_dirent, script_next) { + for_each_script(lang_path, lang_dir, script_dirent) { /* Skip those real time scripts: xxxtop.p[yl] */ - if (strstr(script_dirent.d_name, "top.")) + if (strstr(script_dirent->d_name, "top.")) continue; sprintf(scripts_path_array[i], "%s/%s", lang_path, - script_dirent.d_name); - temp = strchr(script_dirent.d_name, '.'); + script_dirent->d_name); + temp = strchr(script_dirent->d_name, '.'); snprintf(scripts_array[i], - (temp - script_dirent.d_name) + 1, - "%s", script_dirent.d_name); + (temp - script_dirent->d_name) + 1, + "%s", script_dirent->d_name); if (check_ev_match(lang_path, scripts_array[i], session)) @@ -1756,7 +1754,7 @@ int find_scripts(char **scripts_array, char **scripts_path_array) static char *get_script_path(const char *script_root, const char *suffix) { - struct dirent *script_next, *lang_next, script_dirent, lang_dirent; + struct dirent *script_dirent, *lang_dirent; char scripts_path[MAXPATHLEN]; char script_path[MAXPATHLEN]; DIR *scripts_dir, *lang_dir; @@ -1769,21 +1767,21 @@ static char *get_script_path(const char *script_root, const char *suffix) if (!scripts_dir) return NULL; - for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) { + for_each_lang(scripts_path, scripts_dir, lang_dirent) { snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path, - lang_dirent.d_name); + lang_dirent->d_name); lang_dir = opendir(lang_path); if (!lang_dir) continue; - for_each_script(lang_path, lang_dir, script_dirent, script_next) { - __script_root = get_script_root(&script_dirent, suffix); + for_each_script(lang_path, lang_dir, script_dirent) { + __script_root = get_script_root(script_dirent, suffix); if (__script_root && !strcmp(script_root, __script_root)) { free(__script_root); closedir(lang_dir); closedir(scripts_dir); snprintf(script_path, MAXPATHLEN, "%s/%s", - lang_path, script_dirent.d_name); + lang_path, script_dirent->d_name); return strdup(script_path); } free(__script_root); -- GitLab From 3354cf71104de49326d19d2f9bdb1f66eea52ef4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 8 Apr 2016 11:31:24 -0300 Subject: [PATCH 146/705] perf thread_map: Use readdir() instead of deprecated readdir_r() The readdir() function is thread safe as long as just one thread uses a DIR, which is the case in thread_map, so, to avoid breaking the build with glibc-2.23.90 (upcoming 2.24), use it instead of readdir_r(). See: http://man7.org/linux/man-pages/man3/readdir.3.html "However, in modern implementations (including the glibc implementation), concurrent calls to readdir() that specify different directory streams are thread-safe. In cases where multiple threads must read from the same directory stream, using readdir() with external synchronization is still preferable to the use of the deprecated readdir_r(3) function." Noticed while building on a Fedora Rawhide docker container. 
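The /proc scan here also keeps the existing filter that only accepts purely numerical dirents; a minimal sketch of that idiom follows (the helper name is hypothetical, not from this patch):

	#include <dirent.h>
	#include <stdlib.h>
	#include <sys/types.h>

	/* A d_name is a pid only if strtol() consumes the whole string. */
	static pid_t dirent_to_pid(const struct dirent *dent)
	{
		char *end;
		pid_t pid = strtol(dent->d_name, &end, 10);

		return *end ? -1 : pid; /* -1: not a numerical dirent */
	}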
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-del8h2a0f40z75j4r42l96l0@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/thread_map.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c index 08afc69099538..267112b4e3dbe 100644 --- a/tools/perf/util/thread_map.c +++ b/tools/perf/util/thread_map.c @@ -94,7 +94,7 @@ struct thread_map *thread_map__new_by_uid(uid_t uid) DIR *proc; int max_threads = 32, items, i; char path[256]; - struct dirent dirent, *next, **namelist = NULL; + struct dirent *dirent, **namelist = NULL; struct thread_map *threads = thread_map__alloc(max_threads); if (threads == NULL) @@ -107,16 +107,16 @@ struct thread_map *thread_map__new_by_uid(uid_t uid) threads->nr = 0; atomic_set(&threads->refcnt, 1); - while (!readdir_r(proc, &dirent, &next) && next) { + while ((dirent = readdir(proc)) != NULL) { char *end; bool grow = false; struct stat st; - pid_t pid = strtol(dirent.d_name, &end, 10); + pid_t pid = strtol(dirent->d_name, &end, 10); if (*end) /* only interested in proper numerical dirents */ continue; - snprintf(path, sizeof(path), "/proc/%s", dirent.d_name); + snprintf(path, sizeof(path), "/proc/%s", dirent->d_name); if (stat(path, &st) != 0) continue; -- GitLab From 7093b4c963cc4e344e490c774924a180602a7092 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 8 Apr 2016 11:32:15 -0300 Subject: [PATCH 147/705] perf tools: Use readdir() instead of deprecated readdir_r() The readdir() function is thread safe as long as just one thread uses a DIR, which is the case when synthesizing events for pre-existing threads by traversing /proc, so, to avoid breaking the build with glibc-2.23.90 (upcoming 2.24), use it instead of readdir_r(). See: http://man7.org/linux/man-pages/man3/readdir.3.html "However, in modern implementations (including the glibc implementation), concurrent calls to readdir() that specify different directory streams are thread-safe. In cases where multiple threads must read from the same directory stream, using readdir() with external synchronization is still preferable to the use of the deprecated readdir_r(3) function." Noticed while building on a Fedora Rawhide docker container. 
CC /tmp/build/perf/util/event.o util/event.c: In function '__event__synthesize_thread': util/event.c:466:2: error: 'readdir_r' is deprecated [-Werror=deprecated-declarations] while (!readdir_r(tasks, &dirent, &next) && next) { ^~~~~ In file included from /usr/include/features.h:368:0, from /usr/include/stdint.h:25, from /usr/lib/gcc/x86_64-redhat-linux/6.0.0/include/stdint.h:9, from /git/linux/tools/include/linux/types.h:6, from util/event.c:1: /usr/include/dirent.h:189:12: note: declared here Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-i1vj7nyjp2p750rirxgrfd3c@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/event.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index b689590376880..f6fcc68329499 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -434,7 +434,7 @@ static int __event__synthesize_thread(union perf_event *comm_event, { char filename[PATH_MAX]; DIR *tasks; - struct dirent dirent, *next; + struct dirent *dirent; pid_t tgid, ppid; int rc = 0; @@ -463,11 +463,11 @@ static int __event__synthesize_thread(union perf_event *comm_event, return 0; } - while (!readdir_r(tasks, &dirent, &next) && next) { + while ((dirent = readdir(tasks)) != NULL) { char *end; pid_t _pid; - _pid = strtol(dirent.d_name, &end, 10); + _pid = strtol(dirent->d_name, &end, 10); if (*end) continue; @@ -576,7 +576,7 @@ int perf_event__synthesize_threads(struct perf_tool *tool, { DIR *proc; char proc_path[PATH_MAX]; - struct dirent dirent, *next; + struct dirent *dirent; union perf_event *comm_event, *mmap_event, *fork_event; int err = -1; @@ -601,9 +601,9 @@ int perf_event__synthesize_threads(struct perf_tool *tool, if (proc == NULL) goto out_free_fork; - while (!readdir_r(proc, &dirent, &next) && next) { + while ((dirent = readdir(proc)) != NULL) { char *end; - pid_t pid = strtol(dirent.d_name, &end, 10); + pid_t pid = strtol(dirent->d_name, &end, 10); if (*end) /* only interested in proper numerical dirents */ continue; -- GitLab From bfc279f3d233150ff260e9e93012e14f86810648 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 8 Apr 2016 11:53:02 -0300 Subject: [PATCH 148/705] perf tools: Use readdir() instead of deprecated readdir_r() The readdir() function is thread safe as long as just one thread uses a DIR, which is the case when parsing tracepoint event definitions, so, to avoid breaking the build with glibc-2.23.90 (upcoming 2.24), use it instead of readdir_r(). See: http://man7.org/linux/man-pages/man3/readdir.3.html "However, in modern implementations (including the glibc implementation), concurrent calls to readdir() that specify different directory streams are thread-safe. In cases where multiple threads must read from the same directory stream, using readdir() with external synchronization is still preferable to the use of the deprecated readdir_r(3) function." Noticed while building on a Fedora Rawhide docker container.
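The for_each_subsystem()/for_each_event() helpers changed below rely on the C idiom of a while loop immediately followed by an if filter, so the block supplied at the macro's use site becomes the body of the if; a minimal sketch of the idiom (macro and function names are hypothetical):

	#include <dirent.h>
	#include <string.h>

	/* Usable as a loop header: the caller's block binds to the if(). */
	#define for_each_subdir(dir, dent)				\
		while ((dent = readdir(dir)) != NULL)		\
			if (dent->d_type == DT_DIR &&		\
			    strcmp(dent->d_name, ".") &&	\
			    strcmp(dent->d_name, ".."))

	static void walk(DIR *dir)
	{
		struct dirent *dent;

		for_each_subdir(dir, dent) {
			/* runs once for each real subdirectory */
		}
	}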
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-wddn49r6bz6wq4ee3dxbl7lo@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 60 +++++++++++++++++----------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 4c19d5e79d8c4..bcbc983d4b122 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -138,11 +138,11 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = { #define PERF_EVENT_TYPE(config) __PERF_EVENT_FIELD(config, TYPE) #define PERF_EVENT_ID(config) __PERF_EVENT_FIELD(config, EVENT) -#define for_each_subsystem(sys_dir, sys_dirent, sys_next) \ - while (!readdir_r(sys_dir, &sys_dirent, &sys_next) && sys_next) \ - if (sys_dirent.d_type == DT_DIR && \ - (strcmp(sys_dirent.d_name, ".")) && \ - (strcmp(sys_dirent.d_name, ".."))) +#define for_each_subsystem(sys_dir, sys_dirent) \ + while ((sys_dirent = readdir(sys_dir)) != NULL) \ + if (sys_dirent->d_type == DT_DIR && \ + (strcmp(sys_dirent->d_name, ".")) && \ + (strcmp(sys_dirent->d_name, ".."))) static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir) { @@ -159,12 +159,12 @@ static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir) return 0; } -#define for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) \ - while (!readdir_r(evt_dir, &evt_dirent, &evt_next) && evt_next) \ - if (evt_dirent.d_type == DT_DIR && \ - (strcmp(evt_dirent.d_name, ".")) && \ - (strcmp(evt_dirent.d_name, "..")) && \ - (!tp_event_has_id(&sys_dirent, &evt_dirent))) +#define for_each_event(sys_dirent, evt_dir, evt_dirent) \ + while ((evt_dirent = readdir(evt_dir)) != NULL) \ + if (evt_dirent->d_type == DT_DIR && \ + (strcmp(evt_dirent->d_name, ".")) && \ + (strcmp(evt_dirent->d_name, "..")) && \ + (!tp_event_has_id(sys_dirent, evt_dirent))) #define MAX_EVENT_LENGTH 512 @@ -173,7 +173,7 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config) { struct tracepoint_path *path = NULL; DIR *sys_dir, *evt_dir; - struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent; + struct dirent *sys_dirent, *evt_dirent; char id_buf[24]; int fd; u64 id; @@ -184,18 +184,18 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config) if (!sys_dir) return NULL; - for_each_subsystem(sys_dir, sys_dirent, sys_next) { + for_each_subsystem(sys_dir, sys_dirent) { snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path, - sys_dirent.d_name); + sys_dirent->d_name); evt_dir = opendir(dir_path); if (!evt_dir) continue; - for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) { + for_each_event(sys_dirent, evt_dir, evt_dirent) { snprintf(evt_path, MAXPATHLEN, "%s/%s/id", dir_path, - evt_dirent.d_name); + evt_dirent->d_name); fd = open(evt_path, O_RDONLY); if (fd < 0) continue; @@ -220,9 +220,9 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config) free(path); return NULL; } - strncpy(path->system, sys_dirent.d_name, + strncpy(path->system, sys_dirent->d_name, MAX_EVENT_LENGTH); - strncpy(path->name, evt_dirent.d_name, + strncpy(path->name, evt_dirent->d_name, MAX_EVENT_LENGTH); return path; } @@ -1812,7 +1812,7 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob, bool name_only) { DIR *sys_dir, *evt_dir; - struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent; + struct dirent *sys_dirent, *evt_dirent; char evt_path[MAXPATHLEN]; char dir_path[MAXPATHLEN]; char **evt_list = NULL; @@ 
-1830,20 +1830,20 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob, goto out_close_sys_dir; } - for_each_subsystem(sys_dir, sys_dirent, sys_next) { + for_each_subsystem(sys_dir, sys_dirent) { if (subsys_glob != NULL && - !strglobmatch(sys_dirent.d_name, subsys_glob)) + !strglobmatch(sys_dirent->d_name, subsys_glob)) continue; snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path, - sys_dirent.d_name); + sys_dirent->d_name); evt_dir = opendir(dir_path); if (!evt_dir) continue; - for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) { + for_each_event(sys_dirent, evt_dir, evt_dirent) { if (event_glob != NULL && - !strglobmatch(evt_dirent.d_name, event_glob)) + !strglobmatch(evt_dirent->d_name, event_glob)) continue; if (!evt_num_known) { @@ -1852,7 +1852,7 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob, } snprintf(evt_path, MAXPATHLEN, "%s:%s", - sys_dirent.d_name, evt_dirent.d_name); + sys_dirent->d_name, evt_dirent->d_name); evt_list[evt_i] = strdup(evt_path); if (evt_list[evt_i] == NULL) @@ -1905,7 +1905,7 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob, int is_valid_tracepoint(const char *event_string) { DIR *sys_dir, *evt_dir; - struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent; + struct dirent *sys_dirent, *evt_dirent; char evt_path[MAXPATHLEN]; char dir_path[MAXPATHLEN]; @@ -1913,17 +1913,17 @@ int is_valid_tracepoint(const char *event_string) if (!sys_dir) return 0; - for_each_subsystem(sys_dir, sys_dirent, sys_next) { + for_each_subsystem(sys_dir, sys_dirent) { snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path, - sys_dirent.d_name); + sys_dirent->d_name); evt_dir = opendir(dir_path); if (!evt_dir) continue; - for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) { + for_each_event(sys_dirent, evt_dir, evt_dirent) { snprintf(evt_path, MAXPATHLEN, "%s:%s", - sys_dirent.d_name, evt_dirent.d_name); + sys_dirent->d_name, evt_dirent->d_name); if (!strcmp(evt_path, event_string)) { closedir(evt_dir); closedir(sys_dir); -- GitLab From f9383452a26fc47f62c4ddcfa20ccebb7a09c2d8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 8 Apr 2016 12:04:29 -0300 Subject: [PATCH 149/705] perf dwarf: Guard !x86_64 definitions under #ifdef else clause To fix the build on Fedora Rawhide (gcc 6.0.0 20160311 (Red Hat 6.0.0-0.17): CC /tmp/build/perf/arch/x86/util/dwarf-regs.o arch/x86/util/dwarf-regs.c:66:36: error: 'x86_32_regoffset_table' defined but not used [-Werror=unused-const-variable=] static const struct pt_regs_offset x86_32_regoffset_table[] = { ^~~~~~~~~~~~~~~~~~~~~~ cc1: all warnings being treated as errors Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-fghuksc1u8ln82bof4lwcj0o@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/dwarf-regs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/arch/x86/util/dwarf-regs.c b/tools/perf/arch/x86/util/dwarf-regs.c index 9223c164e545d..1f86ee8fb831c 100644 --- a/tools/perf/arch/x86/util/dwarf-regs.c +++ b/tools/perf/arch/x86/util/dwarf-regs.c @@ -63,6 +63,8 @@ struct pt_regs_offset { # define REG_OFFSET_NAME_32(n, r) {.name = n, .offset = offsetof(struct pt_regs, r)} #endif +/* TODO: switching by dwarf address size */ +#ifndef __x86_64__ static const struct pt_regs_offset x86_32_regoffset_table[] = { REG_OFFSET_NAME_32("%ax", eax), REG_OFFSET_NAME_32("%cx", ecx), @@ -75,6 +77,8 @@ 
static const struct pt_regs_offset x86_32_regoffset_table[] = { REG_OFFSET_END, }; +#define regoffset_table x86_32_regoffset_table +#else static const struct pt_regs_offset x86_64_regoffset_table[] = { REG_OFFSET_NAME_64("%ax", rax), REG_OFFSET_NAME_64("%dx", rdx), @@ -95,11 +99,7 @@ static const struct pt_regs_offset x86_64_regoffset_table[] = { REG_OFFSET_END, }; -/* TODO: switching by dwarf address size */ -#ifdef __x86_64__ #define regoffset_table x86_64_regoffset_table -#else -#define regoffset_table x86_32_regoffset_table #endif /* Minus 1 for the ending REG_OFFSET_END */ -- GitLab From 357f435d8a0d32068c75f3c7176434d992b3adb7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 8 Apr 2016 19:05:19 -0400 Subject: [PATCH 150/705] fix the copy vs. map logics in blk_rq_map_user_iov() Signed-off-by: Al Viro --- block/blk-map.c | 47 ++++++++------------------------------------- include/linux/uio.h | 1 + lib/iov_iter.c | 19 ++++++++++++++++++ 3 files changed, 28 insertions(+), 39 deletions(-) diff --git a/block/blk-map.c b/block/blk-map.c index a54f0543b956e..b9f88b7751fbd 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -9,24 +9,6 @@ #include "blk.h" -static bool iovec_gap_to_prv(struct request_queue *q, - struct iovec *prv, struct iovec *cur) -{ - unsigned long prev_end; - - if (!queue_virt_boundary(q)) - return false; - - if (prv->iov_base == NULL && prv->iov_len == 0) - /* prv is not set - don't check */ - return false; - - prev_end = (unsigned long)(prv->iov_base + prv->iov_len); - - return (((unsigned long)cur->iov_base & queue_virt_boundary(q)) || - prev_end & queue_virt_boundary(q)); -} - int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio) { @@ -125,31 +107,18 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, const struct iov_iter *iter, gfp_t gfp_mask) { - struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0}; - bool copy = (q->dma_pad_mask & iter->count) || map_data; + bool copy = false; + unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); struct bio *bio = NULL; struct iov_iter i; int ret; - if (!iter || !iter->count) - return -EINVAL; - - iov_for_each(iov, i, *iter) { - unsigned long uaddr = (unsigned long) iov.iov_base; - - if (!iov.iov_len) - return -EINVAL; - - /* - * Keep going so we check length of all segments - */ - if ((uaddr & queue_dma_alignment(q)) || - iovec_gap_to_prv(q, &prv, &iov)) - copy = true; - - prv.iov_base = iov.iov_base; - prv.iov_len = iov.iov_len; - } + if (map_data) + copy = true; + else if (iov_iter_alignment(iter) & align) + copy = true; + else if (queue_virt_boundary(q)) + copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter); i = *iter; do { diff --git a/include/linux/uio.h b/include/linux/uio.h index fd9bcfedad42d..1b5d1cd796e2b 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -87,6 +87,7 @@ size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); size_t iov_iter_zero(size_t bytes, struct iov_iter *); unsigned long iov_iter_alignment(const struct iov_iter *i); +unsigned long iov_iter_gap_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); void iov_iter_kvec(struct iov_iter *i, int direction, const struct kvec *kvec, diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 5fecddc32b1b4..ca5316e0087b5 100644 --- a/lib/iov_iter.c +++ 
b/lib/iov_iter.c @@ -569,6 +569,25 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) } EXPORT_SYMBOL(iov_iter_alignment); +unsigned long iov_iter_gap_alignment(const struct iov_iter *i) +{ + unsigned long res = 0; + size_t size = i->count; + if (!size) + return 0; + + iterate_all_kinds(i, size, v, + (res |= (!res ? 0 : (unsigned long)v.iov_base) | + (size != v.iov_len ? size : 0), 0), + (res |= (!res ? 0 : (unsigned long)v.bv_offset) | + (size != v.bv_len ? size : 0)), + (res |= (!res ? 0 : (unsigned long)v.iov_base) | + (size != v.iov_len ? size : 0)) + ); + return res; +} +EXPORT_SYMBOL(iov_iter_gap_alignment); + ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) -- GitLab From d78885739a7df111dc7b081f8a09e08a5fcfecc2 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 8 Apr 2016 15:07:24 +0000 Subject: [PATCH 151/705] perf bpf: Clone bpf stdout events in multiple bpf scripts This patch allows cloning the bpf-output event configuration among multiple bpf scripts. If there exists a map named '__bpf_stdout__' that has not been configured using 'map:__bpf_stdout__.event=', this patch clones the configuration of another, already configured, '__bpf_stdout__' map. For example, the following command: # perf trace --ev bpf-output/no-inherit,name=evt/ \ --ev ./test_bpf_trace.c/map:__bpf_stdout__.event=evt/ \ --ev ./test_bpf_trace2.c usleep 100000 is equivalent to: # perf trace --ev bpf-output/no-inherit,name=evt/ \ --ev ./test_bpf_trace.c/map:__bpf_stdout__.event=evt/ \ --ev ./test_bpf_trace2.c/map:__bpf_stdout__.event=evt/ \ usleep 100000 Signed-off-by: Wang Nan Suggested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1460128045-97310-4-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 8 +++ tools/perf/builtin-trace.c | 7 ++ tools/perf/util/bpf-loader.c | 124 +++++++++++++++++++++++++++++++++++ tools/perf/util/bpf-loader.h | 19 ++++++ 4 files changed, 158 insertions(+) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 410035c6e300b..e64bd1ee5acb2 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1276,6 +1276,14 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused) if (err) return err; + err = bpf__setup_stdout(rec->evlist); + if (err) { + bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf)); + pr_err("ERROR: Setup BPF stdout failed: %s\n", + errbuf); + return err; + } + err = -ENOMEM; symbol__init(NULL); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 11290b57ce049..27d9870306279 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3273,6 +3273,13 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands, trace_usage, PARSE_OPT_STOP_AT_NON_OPTION); + err = bpf__setup_stdout(trace.evlist); + if (err) { + bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf)); + pr_err("ERROR: Setup BPF stdout failed: %s\n", bf); + goto out; + } + if (trace.trace_pgfaults) { trace.opts.sample_address = true; trace.opts.sample_time = true; diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 0967ce6019316..67f61a902a08b 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -842,6 +842,58 @@ bpf_map_op__new(struct parse_events_term *term) return op; } +static
struct bpf_map_op * +bpf_map_op__clone(struct bpf_map_op *op) +{ + struct bpf_map_op *newop; + + newop = memdup(op, sizeof(*op)); + if (!newop) { + pr_debug("Failed to alloc bpf_map_op\n"); + return NULL; + } + + INIT_LIST_HEAD(&newop->list); + if (op->key_type == BPF_MAP_KEY_RANGES) { + size_t memsz = op->k.array.nr_ranges * + sizeof(op->k.array.ranges[0]); + + newop->k.array.ranges = memdup(op->k.array.ranges, memsz); + if (!newop->k.array.ranges) { + pr_debug("Failed to alloc indices for map\n"); + free(newop); + return NULL; + } + } + + return newop; +} + +static struct bpf_map_priv * +bpf_map_priv__clone(struct bpf_map_priv *priv) +{ + struct bpf_map_priv *newpriv; + struct bpf_map_op *pos, *newop; + + newpriv = zalloc(sizeof(*newpriv)); + if (!newpriv) { + pr_debug("No enough memory to alloc map private\n"); + return NULL; + } + INIT_LIST_HEAD(&newpriv->ops_list); + + list_for_each_entry(pos, &priv->ops_list, list) { + newop = bpf_map_op__clone(pos); + if (!newop) { + bpf_map_priv__purge(newpriv); + return NULL; + } + list_add_tail(&newop->list, &newpriv->ops_list); + } + + return newpriv; +} + static int bpf_map__add_op(struct bpf_map *map, struct bpf_map_op *op) { @@ -1417,6 +1469,70 @@ int bpf__apply_obj_config(void) return 0; } +#define bpf__for_each_map(pos, obj, objtmp) \ + bpf_object__for_each_safe(obj, objtmp) \ + bpf_map__for_each(pos, obj) + +#define bpf__for_each_stdout_map(pos, obj, objtmp) \ + bpf__for_each_map(pos, obj, objtmp) \ + if (bpf_map__get_name(pos) && \ + (strcmp("__bpf_stdout__", \ + bpf_map__get_name(pos)) == 0)) + +int bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused) +{ + struct bpf_map_priv *tmpl_priv = NULL; + struct bpf_object *obj, *tmp; + struct bpf_map *map; + int err; + bool need_init = false; + + bpf__for_each_stdout_map(map, obj, tmp) { + struct bpf_map_priv *priv; + + err = bpf_map__get_private(map, (void **)&priv); + if (err) + return -BPF_LOADER_ERRNO__INTERNAL; + + /* + * No need to check map type: type should have been + * verified by kernel. 
+ */ + if (!need_init && !priv) + need_init = !priv; + if (!tmpl_priv && priv) + tmpl_priv = priv; + } + + if (!need_init) + return 0; + + if (!tmpl_priv) + return 0; + + bpf__for_each_stdout_map(map, obj, tmp) { + struct bpf_map_priv *priv; + + err = bpf_map__get_private(map, (void **)&priv); + if (err) + return -BPF_LOADER_ERRNO__INTERNAL; + if (priv) + continue; + + priv = bpf_map_priv__clone(tmpl_priv); + if (!priv) + return -ENOMEM; + + err = bpf_map__set_private(map, priv, bpf_map_priv__clear); + if (err) { + bpf_map_priv__clear(map, priv); + return err; + } + } + + return 0; +} + #define ERRNO_OFFSET(e) ((e) - __BPF_LOADER_ERRNO__START) #define ERRCODE_OFFSET(c) ERRNO_OFFSET(BPF_LOADER_ERRNO__##c) #define NR_ERRNO (__BPF_LOADER_ERRNO__END - __BPF_LOADER_ERRNO__START) @@ -1590,3 +1706,11 @@ int bpf__strerror_apply_obj_config(int err, char *buf, size_t size) bpf__strerror_end(buf, size); return 0; } + +int bpf__strerror_setup_stdout(struct perf_evlist *evlist __maybe_unused, + int err, char *buf, size_t size) +{ + bpf__strerror_head(err, buf, size); + bpf__strerror_end(buf, size); + return 0; +} diff --git a/tools/perf/util/bpf-loader.h b/tools/perf/util/bpf-loader.h index be4311944e3da..941e17275aa7d 100644 --- a/tools/perf/util/bpf-loader.h +++ b/tools/perf/util/bpf-loader.h @@ -79,6 +79,11 @@ int bpf__strerror_config_obj(struct bpf_object *obj, size_t size); int bpf__apply_obj_config(void); int bpf__strerror_apply_obj_config(int err, char *buf, size_t size); + +int bpf__setup_stdout(struct perf_evlist *evlist); +int bpf__strerror_setup_stdout(struct perf_evlist *evlist, int err, + char *buf, size_t size); + #else static inline struct bpf_object * bpf__prepare_load(const char *filename __maybe_unused, @@ -124,6 +129,12 @@ bpf__apply_obj_config(void) return 0; } +static inline int +bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused) +{ + return 0; +} + static inline int __bpf_strerror(char *buf, size_t size) { @@ -177,5 +188,13 @@ bpf__strerror_apply_obj_config(int err __maybe_unused, { return __bpf_strerror(buf, size); } + +static inline int +bpf__strerror_setup_stdout(struct perf_evlist *evlist __maybe_unused, + int err __maybe_unused, char *buf, + size_t size) +{ + return __bpf_strerror(buf, size); +} #endif #endif -- GitLab From 72c0809856b9174e71eab4e293089f6a114e0d41 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 8 Apr 2016 15:07:25 +0000 Subject: [PATCH 152/705] perf bpf: Automatically create bpf-output event __bpf_stdout__ This patch removes the need to set a bpf-output event in cmdline. By referencing a map named '__bpf_stdout__', perf automatically creates an event for it. For example: # perf record -e ./test_bpf_trace.c usleep 100000 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.012 MB perf.data (2 samples) ] # perf script usleep 4639 [000] 261895.307826: 0 __bpf_stdout__: ffffffff810eb9a1 ... BPF output: 0000: 52 61 69 73 65 20 61 20 Raise a 0008: 42 50 46 20 65 76 65 6e BPF even 0010: 74 21 00 00 t!.. BPF string: "Raise a BPF event!" usleep 4639 [000] 261895.407883: 0 __bpf_stdout__: ffffffff8105d609 ... BPF output: 0000: 52 61 69 73 65 20 61 20 Raise a 0008: 42 50 46 20 65 76 65 6e BPF even 0010: 74 21 00 00 t!.. BPF string: "Raise a BPF event!" 
perf record -e ./test_bpf_trace.c usleep 100000 is equivalent to: perf record -e bpf-output/no-inherit=1,name=__bpf_stdout__/ \ -e ./test_bpf_trace.c/map:__bpf_stdout__.event=__bpf_stdout__/ \ usleep 100000 Where test_bpf_trace.c is: /************************ BEGIN **************************/ #include struct bpf_map_def { unsigned int type; unsigned int key_size; unsigned int value_size; unsigned int max_entries; }; #define SEC(NAME) __attribute__((section(NAME), used)) static u64 (*ktime_get_ns)(void) = (void *)BPF_FUNC_ktime_get_ns; static int (*trace_printk)(const char *fmt, int fmt_size, ...) = (void *)BPF_FUNC_trace_printk; static int (*get_smp_processor_id)(void) = (void *)BPF_FUNC_get_smp_processor_id; static int (*perf_event_output)(void *, struct bpf_map_def *, int, void *, unsigned long) = (void *)BPF_FUNC_perf_event_output; struct bpf_map_def SEC("maps") __bpf_stdout__ = { .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, .key_size = sizeof(int), .value_size = sizeof(u32), .max_entries = __NR_CPUS__, }; static inline int __attribute__((always_inline)) func(void *ctx, int type) { char output_str[] = "Raise a BPF event!"; char err_str[] = "BAD %d\n"; int err; err = perf_event_output(ctx, &__bpf_stdout__, get_smp_processor_id(), &output_str, sizeof(output_str)); if (err) trace_printk(err_str, sizeof(err_str), err); return 1; } SEC("func_begin=sys_nanosleep") int func_begin(void *ctx) {return func(ctx, 1);} SEC("func_end=sys_nanosleep%return") int func_end(void *ctx) { return func(ctx, 2);} char _license[] SEC("license") = "GPL"; int _version SEC("version") = LINUX_VERSION_CODE; /************************* END ***************************/ Committer note: Testing with 'perf trace': # trace -e nanosleep --ev test_bpf_stdout.c usleep 1 0.007 ( 0.007 ms): usleep/729 nanosleep(rqtp: 0x7ffc5bbc5fe0) ... 0.007 ( ): __bpf_stdout__:Raise a BPF event!..) 0.008 ( ): perf_bpf_probe:func_begin:(ffffffff81112460)) 0.069 ( ): __bpf_stdout__:Raise a BPF event!..) 0.070 ( ): perf_bpf_probe:func_end:(ffffffff81112460 <- ffffffff81003d92)) 0.072 ( 0.072 ms): usleep/729 ...
[continued]: nanosleep()) = 0 # Suggested-and-Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Wang Nan Cc: Jiri Olsa Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1460128045-97310-5-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-loader.c | 37 +++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 67f61a902a08b..493307d1414ce 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -1483,6 +1483,7 @@ int bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused) { struct bpf_map_priv *tmpl_priv = NULL; struct bpf_object *obj, *tmp; + struct perf_evsel *evsel = NULL; struct bpf_map *map; int err; bool need_init = false; @@ -1507,8 +1508,16 @@ int bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused) if (!need_init) return 0; - if (!tmpl_priv) - return 0; + if (!tmpl_priv) { + err = parse_events(evlist, "bpf-output/no-inherit=1,name=__bpf_stdout__/", + NULL); + if (err) { + pr_debug("ERROR: failed to create bpf-output event\n"); + return -err; + } + + evsel = perf_evlist__last(evlist); + } bpf__for_each_stdout_map(map, obj, tmp) { struct bpf_map_priv *priv; @@ -1519,14 +1528,24 @@ int bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused) if (priv) continue; - priv = bpf_map_priv__clone(tmpl_priv); - if (!priv) - return -ENOMEM; + if (tmpl_priv) { + priv = bpf_map_priv__clone(tmpl_priv); + if (!priv) + return -ENOMEM; - err = bpf_map__set_private(map, priv, bpf_map_priv__clear); - if (err) { - bpf_map_priv__clear(map, priv); - return err; + err = bpf_map__set_private(map, priv, bpf_map_priv__clear); + if (err) { + bpf_map_priv__clear(map, priv); + return err; + } + } else if (evsel) { + struct bpf_map_op *op; + + op = bpf_map__add_newop(map, NULL); + if (IS_ERR(op)) + return PTR_ERR(op); + op->op_type = BPF_MAP_OP_SET_EVSEL; + op->v.evsel = evsel; } } -- GitLab From 6186de9a491af030889b372193fc9f38c248e69a Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Mon, 11 Apr 2016 10:18:11 -0300 Subject: [PATCH 153/705] perf evsel: Allow specifying a file to output in perf_evsel__print_ip As this function will be used in 'perf trace'. 
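This follows the common pattern of threading a FILE * through a printing helper instead of hard-coding stdout, so 'perf trace' can later point it at trace->output; a minimal sketch of the pattern (names hypothetical, not this function's real signature):

	#include <stdio.h>

	/* The caller picks the stream; passing stdout keeps the old behaviour. */
	static int fprintf_ip_sym(FILE *fp, unsigned long ip, const char *sym)
	{
		return fprintf(fp, "%16lx %s\n", ip, sym);
	}

	/* e.g. fprintf_ip_sym(stdout, ip, name); or a file of the caller's choice */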
Cc: Jiri Olsa Link: http://lkml.kernel.org/n/tip-8x297v9utnxq77onikevvlse@git.kernel.org [ Split from a larger patch ] Signed-off-by: Milian Wolff --- tools/perf/builtin-script.c | 4 ++-- tools/perf/util/session.c | 39 ++++++++++++++++++++----------------- tools/perf/util/session.h | 3 ++- 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 8f6ab2ac855ad..dbf208f0cdc2f 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -580,7 +580,7 @@ static void print_sample_bts(struct perf_sample *sample, } } perf_evsel__print_ip(evsel, sample, al, print_opts, - scripting_max_stack); + scripting_max_stack, stdout); } /* print branch_to information */ @@ -790,7 +790,7 @@ static void process_event(struct perf_script *script, perf_evsel__print_ip(evsel, sample, al, output[attr->type].print_ip_opts, - scripting_max_stack); + scripting_max_stack, stdout); } if (PRINT_FIELD(IREGS)) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index ef370557fb9ae..bbac0efbc10c8 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1955,7 +1955,8 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, void perf_evsel__print_ip(struct perf_evsel *evsel, struct perf_sample *sample, struct addr_location *al, - unsigned int print_opts, unsigned int stack_depth) + unsigned int print_opts, unsigned int stack_depth, + FILE *fp) { struct callchain_cursor_node *node; int print_ip = print_opts & PRINT_IP_OPT_IP; @@ -1992,33 +1993,35 @@ void perf_evsel__print_ip(struct perf_evsel *evsel, struct perf_sample *sample, goto next; if (print_ip) - printf("%c%16" PRIx64, s, node->ip); + fprintf(fp, "%c%16" PRIx64, s, node->ip); if (node->map) addr = node->map->map_ip(node->map, node->ip); if (print_sym) { - printf(" "); + fprintf(fp, " "); if (print_symoffset) { node_al.addr = addr; node_al.map = node->map; - symbol__fprintf_symname_offs(node->sym, &node_al, stdout); + symbol__fprintf_symname_offs(node->sym, + &node_al, + fp); } else - symbol__fprintf_symname(node->sym, stdout); + symbol__fprintf_symname(node->sym, fp); } if (print_dso) { - printf(" ("); - map__fprintf_dsoname(node->map, stdout); - printf(")"); + fprintf(fp, " ("); + map__fprintf_dsoname(node->map, fp); + fprintf(fp, ")"); } if (print_srcline) map__fprintf_srcline(node->map, addr, "\n ", - stdout); + fp); if (!print_oneline) - printf("\n"); + fprintf(fp, "\n"); stack_depth--; next: @@ -2030,25 +2033,25 @@ void perf_evsel__print_ip(struct perf_evsel *evsel, struct perf_sample *sample, return; if (print_ip) - printf("%16" PRIx64, sample->ip); + fprintf(fp, "%16" PRIx64, sample->ip); if (print_sym) { - printf(" "); + fprintf(fp, " "); if (print_symoffset) symbol__fprintf_symname_offs(al->sym, al, - stdout); + fp); else - symbol__fprintf_symname(al->sym, stdout); + symbol__fprintf_symname(al->sym, fp); } if (print_dso) { - printf(" ("); - map__fprintf_dsoname(al->map, stdout); - printf(")"); + fprintf(fp, " ("); + map__fprintf_dsoname(al->map, fp); + fprintf(fp, ")"); } if (print_srcline) - map__fprintf_srcline(al->map, al->addr, "\n ", stdout); + map__fprintf_srcline(al->map, al->addr, "\n ", fp); } } diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index f96fc9e8c52e7..0ee3d9dbc0999 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -106,7 +106,8 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, void perf_evsel__print_ip(struct 
perf_evsel *evsel, struct perf_sample *sample, struct addr_location *al, - unsigned int print_opts, unsigned int stack_depth); + unsigned int print_opts, unsigned int stack_depth, + FILE *fp); int perf_session__cpu_bitmap(struct perf_session *session, const char *cpu_list, unsigned long *cpu_bitmap); -- GitLab From db3617f362d7e205621c1ccc22b77d224a81ee14 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Apr 2016 10:53:51 -0300 Subject: [PATCH 154/705] perf evsel: Allow passing a left alignment when printing a symbol For callchains, etc where we want it to align just below the syscall name, for instance, in 'perf trace' Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-uk9ekchd67651c625ltaur5y@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 4 ++-- tools/perf/util/session.c | 6 +++++- tools/perf/util/session.h | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index dbf208f0cdc2f..60fde9f5025c1 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -579,7 +579,7 @@ static void print_sample_bts(struct perf_sample *sample, print_opts &= ~PRINT_IP_OPT_SRCLINE; } } - perf_evsel__print_ip(evsel, sample, al, print_opts, + perf_evsel__print_ip(evsel, sample, al, 0, print_opts, scripting_max_stack, stdout); } @@ -788,7 +788,7 @@ static void process_event(struct perf_script *script, else printf("\n"); - perf_evsel__print_ip(evsel, sample, al, + perf_evsel__print_ip(evsel, sample, al, 0, output[attr->type].print_ip_opts, scripting_max_stack, stdout); } diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index bbac0efbc10c8..62b6d4051b99c 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1954,7 +1954,7 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, } void perf_evsel__print_ip(struct perf_evsel *evsel, struct perf_sample *sample, - struct addr_location *al, + struct addr_location *al, int left_alignment, unsigned int print_opts, unsigned int stack_depth, FILE *fp) { @@ -1992,6 +1992,8 @@ void perf_evsel__print_ip(struct perf_evsel *evsel, struct perf_sample *sample, if (node->sym && node->sym->ignore) goto next; + fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); + if (print_ip) fprintf(fp, "%c%16" PRIx64, s, node->ip); @@ -2032,6 +2034,8 @@ void perf_evsel__print_ip(struct perf_evsel *evsel, struct perf_sample *sample, if (al->sym && al->sym->ignore) return; + fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); + if (print_ip) fprintf(fp, "%16" PRIx64, sample->ip); diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 0ee3d9dbc0999..a6bc4ddbae3e8 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -105,7 +105,7 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, unsigned int type); void perf_evsel__print_ip(struct perf_evsel *evsel, struct perf_sample *sample, - struct addr_location *al, + struct addr_location *al, int left_alignment, unsigned int print_opts, unsigned int stack_depth, FILE *fp); -- GitLab From 566a08859f63a33746e25246c5cda0f52528d2e4 Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Fri, 8 Apr 2016 13:34:15 +0200 Subject: [PATCH 155/705] perf trace: Add support for printing call chains on sys_exit events. 
Now, one can print the call chain for every encountered sys_exit event, e.g.: $ perf trace -e nanosleep --call-graph dwarf path/to/ex_sleep 1005.757 (1000.090 ms): ex_sleep/13167 nanosleep(...) = 0 syscall_slow_exit_work ([kernel.kallsyms]) syscall_return_slowpath ([kernel.kallsyms]) int_ret_from_sys_call ([kernel.kallsyms]) __nanosleep (/usr/lib/libc-2.23.so) [unknown] (/usr/lib/libQt5Core.so.5.6.0) QThread::sleep (/usr/lib/libQt5Core.so.5.6.0) main (path/to/ex_sleep) __libc_start_main (/usr/lib/libc-2.23.so) _start (path/to/ex_sleep) Note that it is advised to increase the number of mmap pages to prevent event losses when using this new feature. Often, adding `-m 10M` to the `perf trace` invocation is enough. This feature is also available in strace when built with libunwind via `strace -k`. Performance wise, this solution is much better: $ time find path/to/linux &> /dev/null real 0m0.051s user 0m0.013s sys 0m0.037s $ time perf trace -m 800M --call-graph dwarf find path/to/linux &> /dev/null real 0m2.624s user 0m1.203s sys 0m1.333s $ time strace -k find path/to/linux &> /dev/null real 0m35.398s user 0m10.403s sys 0m23.173s Note that it is currently not possible to configure the print output. Adding such a feature, similar to what is available in `perf script` via its `--fields` knob can be added later on. Signed-off-by: Milian Wolff Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan LPU-Reference: 1460115255-17648-1-git-send-email-milian.wolff@kdab.com [ Split from a larger patch, do not print the IP, left align, remove dup call symbol__init(), added man page entry ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-trace.txt | 6 ++++++ tools/perf/builtin-trace.c | 22 ++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index 13293de8869fe..ed485df16409e 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -117,6 +117,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs. --syscalls:: Trace system calls. This options is enabled by default. +--call-graph [mode,type,min[,limit],order[,key][,branch]]:: + Setup and enable call-graph (stack chain/backtrace) recording. + See `--call-graph` section in perf-record and perf-report + man pages for details. The ones that are most useful in 'perf trace' + are 'dwarf' and 'lbr', where available, try: 'perf trace --call-graph dwarf'. + --event:: Trace other events, see 'perf list' for a complete list. 
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 27d9870306279..8c587a8d3742a 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -34,6 +34,7 @@ #include "trace-event.h" #include "util/parse-events.h" #include "util/bpf-loader.h" +#include "callchain.h" #include "syscalltbl.h" #include /* FIXME: Still needed for audit_errno_to_name */ @@ -2190,6 +2191,21 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, goto signed_print; fputc('\n', trace->output); + + if (sample->callchain) { + struct addr_location al; + /* TODO: user-configurable print_opts */ + const unsigned int print_opts = PRINT_IP_OPT_SYM + | PRINT_IP_OPT_DSO; + + if (machine__resolve(trace->host, &al, sample) < 0) { + pr_err("problem processing %d event, skipping it.\n", + event->header.type); + goto out_put; + } + perf_evsel__print_ip(evsel, sample, &al, 38, print_opts, + scripting_max_stack, trace->output); + } out: ttrace->entry_pending = false; err = 0; @@ -3250,6 +3266,9 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) "Trace pagefaults", parse_pagefaults, "maj"), OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"), OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"), + OPT_CALLBACK(0, "call-graph", &trace.opts, + "record_mode[,record_size]", record_callchain_help, + &record_parse_callchain_opt), OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout, "per thread proc mmap processing timeout in ms"), OPT_END() @@ -3285,6 +3304,9 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) trace.opts.sample_time = true; } + if (trace.opts.callgraph_set) + symbol_conf.use_callchain = true; + if (trace.evlist->nr_entries > 0) evlist__set_evsel_handler(trace.evlist, trace__event_handler); -- GitLab From ff0c107806cf9d237e50e21de66d6909391071cd Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Apr 2016 11:14:06 -0300 Subject: [PATCH 156/705] perf evsel: Rename print_ip() to fprintf_sym() As it receives a FILE, and it's more than just the IP, which can even be requested not to be printed.
For consistency with other similar methods in tools/perf/, name it as perf_evsel__fprintf_sym() and make it return the number of bytes printed, just like 'fprintf(3)' Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-84gawlqa3lhk63nf0t9vnqnn@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 10 +++---- tools/perf/builtin-trace.c | 4 +-- tools/perf/util/session.c | 60 +++++++++++++++++-------------------- tools/perf/util/session.h | 8 ++--- 4 files changed, 39 insertions(+), 43 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 60fde9f5025c1..ddd5b79e94c27 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -579,8 +579,8 @@ static void print_sample_bts(struct perf_sample *sample, print_opts &= ~PRINT_IP_OPT_SRCLINE; } } - perf_evsel__print_ip(evsel, sample, al, 0, print_opts, - scripting_max_stack, stdout); + perf_evsel__fprintf_sym(evsel, sample, al, 0, print_opts, + scripting_max_stack, stdout); } /* print branch_to information */ @@ -788,9 +788,9 @@ static void process_event(struct perf_script *script, else printf("\n"); - perf_evsel__print_ip(evsel, sample, al, 0, - output[attr->type].print_ip_opts, - scripting_max_stack, stdout); + perf_evsel__fprintf_sym(evsel, sample, al, 0, + output[attr->type].print_ip_opts, + scripting_max_stack, stdout); } if (PRINT_FIELD(IREGS)) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 8c587a8d3742a..a0d5c680c39ee 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2203,8 +2203,8 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, event->header.type); goto out_put; } - perf_evsel__print_ip(evsel, sample, &al, 38, print_opts, - scripting_max_stack, trace->output); + perf_evsel__fprintf_sym(evsel, sample, &al, 38, print_opts, + scripting_max_stack, trace->output); } out: ttrace->entry_pending = false; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 62b6d4051b99c..0669a088ea0d4 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1953,11 +1953,12 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, return NULL; } -void perf_evsel__print_ip(struct perf_evsel *evsel, struct perf_sample *sample, - struct addr_location *al, int left_alignment, - unsigned int print_opts, unsigned int stack_depth, - FILE *fp) +int perf_evsel__fprintf_sym(struct perf_evsel *evsel, struct perf_sample *sample, + struct addr_location *al, int left_alignment, + unsigned int print_opts, unsigned int stack_depth, + FILE *fp) { + int printed = 0; struct callchain_cursor_node *node; int print_ip = print_opts & PRINT_IP_OPT_IP; int print_sym = print_opts & PRINT_IP_OPT_SYM; @@ -1975,7 +1976,7 @@ void perf_evsel__print_ip(struct perf_evsel *evsel, struct perf_sample *sample, stack_depth) != 0) { if (verbose) error("Failed to resolve callchain. 
Skipping\n"); - return; + return printed; } callchain_cursor_commit(&callchain_cursor); @@ -1992,71 +1993,66 @@ void perf_evsel__print_ip(struct perf_evsel *evsel, struct perf_sample *sample, if (node->sym && node->sym->ignore) goto next; - fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); + printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); if (print_ip) - fprintf(fp, "%c%16" PRIx64, s, node->ip); + printed += fprintf(fp, "%c%16" PRIx64, s, node->ip); if (node->map) addr = node->map->map_ip(node->map, node->ip); if (print_sym) { - fprintf(fp, " "); + printed += fprintf(fp, " "); if (print_symoffset) { node_al.addr = addr; node_al.map = node->map; - symbol__fprintf_symname_offs(node->sym, - &node_al, - fp); + printed += symbol__fprintf_symname_offs(node->sym, &node_al, fp); } else - symbol__fprintf_symname(node->sym, fp); + printed += symbol__fprintf_symname(node->sym, fp); } if (print_dso) { - fprintf(fp, " ("); - map__fprintf_dsoname(node->map, fp); - fprintf(fp, ")"); + printed += fprintf(fp, " ("); + printed += map__fprintf_dsoname(node->map, fp); + printed += fprintf(fp, ")"); } if (print_srcline) - map__fprintf_srcline(node->map, addr, "\n ", - fp); + printed += map__fprintf_srcline(node->map, addr, "\n ", fp); if (!print_oneline) - fprintf(fp, "\n"); + printed += fprintf(fp, "\n"); stack_depth--; next: callchain_cursor_advance(&callchain_cursor); } - } else { - if (al->sym && al->sym->ignore) - return; - - fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); + } else if (!(al->sym && al->sym->ignore)) { + printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); if (print_ip) - fprintf(fp, "%16" PRIx64, sample->ip); + printed += fprintf(fp, "%16" PRIx64, sample->ip); if (print_sym) { - fprintf(fp, " "); + printed += fprintf(fp, " "); if (print_symoffset) - symbol__fprintf_symname_offs(al->sym, al, - fp); + printed += symbol__fprintf_symname_offs(al->sym, al, fp); else - symbol__fprintf_symname(al->sym, fp); + printed += symbol__fprintf_symname(al->sym, fp); } if (print_dso) { - fprintf(fp, " ("); - map__fprintf_dsoname(al->map, fp); - fprintf(fp, ")"); + printed += fprintf(fp, " ("); + printed += map__fprintf_dsoname(al->map, fp); + printed += fprintf(fp, ")"); } if (print_srcline) - map__fprintf_srcline(al->map, al->addr, "\n ", fp); + printed += map__fprintf_srcline(al->map, al->addr, "\n ", fp); } + + return printed; } int perf_session__cpu_bitmap(struct perf_session *session, diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index a6bc4ddbae3e8..ac834908bb35d 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -104,10 +104,10 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp); struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, unsigned int type); -void perf_evsel__print_ip(struct perf_evsel *evsel, struct perf_sample *sample, - struct addr_location *al, int left_alignment, - unsigned int print_opts, unsigned int stack_depth, - FILE *fp); +int perf_evsel__fprintf_sym(struct perf_evsel *evsel, struct perf_sample *sample, + struct addr_location *al, int left_alignment, + unsigned int print_opts, unsigned int stack_depth, + FILE *fp); int perf_session__cpu_bitmap(struct perf_session *session, const char *cpu_list, unsigned long *cpu_bitmap); -- GitLab From ea4539652eccc87b14fbcbc90467ebcb87f02ddb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Apr 2016 12:15:48 -0300 Subject: [PATCH 157/705] perf evsel: Introduce 
fprintf_callchain() method out of fprintf_sym() In 'perf trace' we're just interested in printing callchains, and we don't want to use the symbol_conf.use_callchain, so move the callchain part to a new method. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-kcn3romzivcpxb3u75s9nz33@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 4 ++-- tools/perf/util/evsel.h | 6 ++++++ tools/perf/util/session.c | 29 ++++++++++++++++++++++++----- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index a0d5c680c39ee..63a3cc9b717c4 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2203,8 +2203,8 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, event->header.type); goto out_put; } - perf_evsel__fprintf_sym(evsel, sample, &al, 38, print_opts, - scripting_max_stack, trace->output); + perf_evsel__fprintf_callchain(evsel, sample, &al, 38, print_opts, + scripting_max_stack, trace->output); } out: ttrace->entry_pending = false; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 501ea6e565f13..ab3632caba9f5 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -381,6 +381,12 @@ struct perf_attr_details { int perf_evsel__fprintf(struct perf_evsel *evsel, struct perf_attr_details *details, FILE *fp); +int perf_evsel__fprintf_callchain(struct perf_evsel *evsel, + struct perf_sample *sample, + struct addr_location *al, int left_alignment, + unsigned int print_opts, + unsigned int stack_depth, FILE *fp); + bool perf_evsel__fallback(struct perf_evsel *evsel, int err, char *msg, size_t msgsize); int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 0669a088ea0d4..e384b651a3e86 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1953,10 +1953,10 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, return NULL; } -int perf_evsel__fprintf_sym(struct perf_evsel *evsel, struct perf_sample *sample, - struct addr_location *al, int left_alignment, - unsigned int print_opts, unsigned int stack_depth, - FILE *fp) +int perf_evsel__fprintf_callchain(struct perf_evsel *evsel, struct perf_sample *sample, + struct addr_location *al, int left_alignment, + unsigned int print_opts, unsigned int stack_depth, + FILE *fp) { int printed = 0; struct callchain_cursor_node *node; @@ -1968,7 +1968,7 @@ int perf_evsel__fprintf_sym(struct perf_evsel *evsel, struct perf_sample *sample int print_srcline = print_opts & PRINT_IP_OPT_SRCLINE; char s = print_oneline ? 
' ' : '\t'; - if (symbol_conf.use_callchain && sample->callchain) { + if (sample->callchain) { struct addr_location node_al; if (thread__resolve_callchain(al->thread, evsel, @@ -2027,7 +2027,26 @@ int perf_evsel__fprintf_sym(struct perf_evsel *evsel, struct perf_sample *sample next: callchain_cursor_advance(&callchain_cursor); } + } + + return printed; +} + +int perf_evsel__fprintf_sym(struct perf_evsel *evsel, struct perf_sample *sample, + struct addr_location *al, int left_alignment, + unsigned int print_opts, unsigned int stack_depth, + FILE *fp) +{ + int printed = 0; + int print_ip = print_opts & PRINT_IP_OPT_IP; + int print_sym = print_opts & PRINT_IP_OPT_SYM; + int print_dso = print_opts & PRINT_IP_OPT_DSO; + int print_symoffset = print_opts & PRINT_IP_OPT_SYMOFFSET; + int print_srcline = print_opts & PRINT_IP_OPT_SRCLINE; + if (symbol_conf.use_callchain && sample->callchain) { + printed += perf_evsel__fprintf_callchain(evsel, sample, al, left_alignment, + print_opts, stack_depth, fp); } else if (!(al->sym && al->sym->ignore)) { printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); -- GitLab From 44621819ddc9d5d0bfd0b0616c6cf33c94189b67 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Apr 2016 15:49:11 -0300 Subject: [PATCH 158/705] perf trace: Exclude the kernel part of the callchain leading to a syscall The kernel parts are not that useful: # trace -m 512 -e nanosleep --call dwarf usleep 1 0.065 ( 0.065 ms): usleep/18732 nanosleep(rqtp: 0x7ffc4ee4e200) = 0 syscall_slow_exit_work ([kernel.kallsyms]) do_syscall_64 ([kernel.kallsyms]) return_from_SYSCALL_64 ([kernel.kallsyms]) __nanosleep (/usr/lib64/libc-2.22.so) usleep (/usr/lib64/libc-2.22.so) main (/usr/bin/usleep) __libc_start_main (/usr/lib64/libc-2.22.so) _start (/usr/bin/usleep) # So let's just use perf_event_attr.exclude_callchain_kernel to avoid collecting it in the ring buffer: # trace -m 512 -e nanosleep --call dwarf usleep 1 0.063 ( 0.063 ms): usleep/19212 nanosleep(rqtp: 0x7ffc3df10fb0) = 0 __nanosleep (/usr/lib64/libc-2.22.so) usleep (/usr/lib64/libc-2.22.so) main (/usr/bin/usleep) __libc_start_main (/usr/lib64/libc-2.22.so) _start (/usr/bin/usleep) # Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-qctu3gqhpim0dfbcp9d86c91@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-trace.txt | 3 +++ tools/perf/builtin-trace.c | 13 +++++++++++++ 2 files changed, 16 insertions(+) diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index ed485df16409e..1bbcf305d2331 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -123,6 +123,9 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs. man pages for details. The ones that are most useful in 'perf trace' are 'dwarf' and 'lbr', where available, try: 'perf trace --call-graph dwarf'. +--kernel-syscall-graph:: + Show the kernel callchains on the syscall exit path. + --event:: Trace other events, see 'perf list' for a complete list. 
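Before the builtin-trace.c part of this patch, it is worth seeing the mechanism in isolation: everything hinges on a single bit in struct perf_event_attr. The following is a minimal standalone sketch, not part of the patch itself; the tracepoint id lookup and all error handling are assumed/omitted.

    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /*
     * Sketch: open a tracepoint event whose sampled callchains stop at
     * the kernel/user boundary, keeping only user-space frames.
     */
    static int open_user_only_callchain_event(unsigned long long tracepoint_id)
    {
    	struct perf_event_attr attr;

    	memset(&attr, 0, sizeof(attr));
    	attr.size = sizeof(attr);
    	attr.type = PERF_TYPE_TRACEPOINT;
    	attr.config = tracepoint_id;		/* e.g. raw_syscalls:sys_exit */
    	attr.sample_period = 1;
    	attr.sample_type = PERF_SAMPLE_CALLCHAIN;
    	attr.exclude_callchain_kernel = 1;	/* the bit this patch sets */

    	/* calling thread, any CPU */
    	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    }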
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 63a3cc9b717c4..cfa5ce8fdb7bf 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -159,6 +159,7 @@ struct trace { bool show_comm; bool show_tool_stats; bool trace_syscalls; + bool kernel_syscallchains; bool force; bool vfs_getname; int trace_pgfaults; @@ -2661,6 +2662,15 @@ static int trace__add_syscall_newtp(struct trace *trace) perf_evlist__add(evlist, sys_enter); perf_evlist__add(evlist, sys_exit); + if (trace->opts.callgraph_set && !trace->kernel_syscallchains) { + /* + * We're interested only in the user space callchain + * leading to the syscall, allow overriding that for + * debugging reasons using --kernel-syscall-graph + */ + sys_exit->attr.exclude_callchain_kernel = 1; + } + trace->syscalls.events.sys_enter = sys_enter; trace->syscalls.events.sys_exit = sys_exit; @@ -3221,6 +3231,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) .output = stderr, .show_comm = true, .trace_syscalls = true, + .kernel_syscallchains = false, }; const char *output_name = NULL; const char *ev_qualifier_str = NULL; @@ -3269,6 +3280,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) OPT_CALLBACK(0, "call-graph", &trace.opts, "record_mode[,record_size]", record_callchain_help, &record_parse_callchain_opt), + OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains, + "Show the kernel callchains on the syscall exit path"), OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout, "per thread proc mmap processing timeout in ms"), OPT_END() -- GitLab From e68ae9cf7d734e669bc0a981b4154f70d29b5059 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Apr 2016 18:15:29 -0300 Subject: [PATCH 159/705] perf evsel: Do not use globals in config() Instead receive a callchain_param pointer to configure callchain aspects, not doing so if NULL is passed. This will allow fine-grained control over which evsels in an evlist get callchains enabled. 
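The shape of that change, reduced to a sketch with hypothetical names (the real signatures are in the diff below): the global the function used to read becomes a parameter, and NULL means "do not touch callchain configuration at all".

    /* Hypothetical miniature of the refactor, not actual perf code. */
    struct callchain_param_sketch { int enabled; };
    struct evsel_sketch { int wants_callchain; };

    /*
     * Before, this would consult a global callchain_param; now the
     * caller passes a parameter block, or NULL to opt out entirely.
     */
    static void config_sketch(struct evsel_sketch *evsel,
    			  const struct callchain_param_sketch *callchain)
    {
    	evsel->wants_callchain = callchain && callchain->enabled;
    }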
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-2mupip6khc92mh5x4nw9to82@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/tests/perf-time-to-tsc.c | 2 +- tools/perf/builtin-kvm.c | 2 +- tools/perf/builtin-record.c | 2 +- tools/perf/builtin-top.c | 2 +- tools/perf/builtin-trace.c | 2 +- tools/perf/tests/bpf.c | 2 +- tools/perf/tests/code-reading.c | 2 +- tools/perf/tests/keep-tracking.c | 2 +- tools/perf/tests/openat-syscall-tp-fields.c | 2 +- tools/perf/tests/perf-record.c | 2 +- tools/perf/tests/switch-tracking.c | 2 +- tools/perf/util/evlist.h | 5 ++++- tools/perf/util/evsel.c | 7 ++++--- tools/perf/util/evsel.h | 5 ++++- tools/perf/util/record.c | 5 +++-- 15 files changed, 26 insertions(+), 18 deletions(-) diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c index 9d29ee283ac53..d4aa567a29c46 100644 --- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c +++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c @@ -71,7 +71,7 @@ int test__perf_time_to_tsc(int subtest __maybe_unused) CHECK__(parse_events(evlist, "cycles:u", NULL)); - perf_evlist__config(evlist, &opts); + perf_evlist__config(evlist, &opts, NULL); evsel = perf_evlist__first(evlist); diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index bff666458b28e..6487c06d27085 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -982,7 +982,7 @@ static int kvm_live_open_events(struct perf_kvm_stat *kvm) struct perf_evlist *evlist = kvm->evlist; char sbuf[STRERR_BUFSIZE]; - perf_evlist__config(evlist, &kvm->opts); + perf_evlist__config(evlist, &kvm->opts, NULL); /* * Note: exclude_{guest,host} do not apply here. 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index e64bd1ee5acb2..eb6a199a833c2 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -284,7 +284,7 @@ static int record__open(struct record *rec) struct record_opts *opts = &rec->opts; int rc = 0; - perf_evlist__config(evlist, opts); + perf_evlist__config(evlist, opts, &callchain_param); evlist__for_each(evlist, pos) { try_again: diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 833214979c4f4..8846df0ec0c3f 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -886,7 +886,7 @@ static int perf_top__start_counters(struct perf_top *top) struct perf_evlist *evlist = top->evlist; struct record_opts *opts = &top->record_opts; - perf_evlist__config(evlist, opts); + perf_evlist__config(evlist, opts, &callchain_param); evlist__for_each(evlist, counter) { try_again: diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index cfa5ce8fdb7bf..08fb100b91fa5 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2749,7 +2749,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) goto out_delete_evlist; } - perf_evlist__config(evlist, &trace->opts); + perf_evlist__config(evlist, &trace->opts, &callchain_param); signal(SIGCHLD, sig_handler); signal(SIGINT, sig_handler); diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index 199501c71e272..f31eed31c1a9c 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -138,7 +138,7 @@ static int do_test(struct bpf_object *obj, int (*func)(void), perf_evlist__splice_list_tail(evlist, &parse_evlist.list); evlist->nr_groups = parse_evlist.nr_groups; - perf_evlist__config(evlist, &opts); + perf_evlist__config(evlist, &opts, NULL); err = perf_evlist__open(evlist); if (err < 0) { diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index abd3f0ec0c0b8..68a69a195545e 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -532,7 +532,7 @@ static int do_test_code_reading(bool try_kcore) goto out_put; } - perf_evlist__config(evlist, &opts); + perf_evlist__config(evlist, &opts, NULL); evsel = perf_evlist__first(evlist); diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c index ddb78fae064a5..614e45a3c6038 100644 --- a/tools/perf/tests/keep-tracking.c +++ b/tools/perf/tests/keep-tracking.c @@ -80,7 +80,7 @@ int test__keep_tracking(int subtest __maybe_unused) CHECK__(parse_events(evlist, "dummy:u", NULL)); CHECK__(parse_events(evlist, "cycles:u", NULL)); - perf_evlist__config(evlist, &opts); + perf_evlist__config(evlist, &opts, NULL); evsel = perf_evlist__first(evlist); diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c index eb99a105f31ce..4344fe482c1d2 100644 --- a/tools/perf/tests/openat-syscall-tp-fields.c +++ b/tools/perf/tests/openat-syscall-tp-fields.c @@ -44,7 +44,7 @@ int test__syscall_openat_tp_fields(int subtest __maybe_unused) goto out_delete_evlist; } - perf_evsel__config(evsel, &opts); + perf_evsel__config(evsel, &opts, NULL); thread_map__set_pid(evlist->threads, 0, getpid()); diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 1cc78cefe3990..b836ee6a8d9bb 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -99,7 +99,7 @@ int test__PERF_RECORD(int subtest __maybe_unused) perf_evsel__set_sample_bit(evsel, CPU); perf_evsel__set_sample_bit(evsel, TID); 
perf_evsel__set_sample_bit(evsel, TIME); - perf_evlist__config(evlist, &opts); + perf_evlist__config(evlist, &opts, NULL); err = sched__get_first_possible_cpu(evlist->workload.pid, &cpu_mask); if (err < 0) { diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index ebd80168d51e8..39a689bf7574e 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -417,7 +417,7 @@ int test__switch_tracking(int subtest __maybe_unused) perf_evsel__set_sample_bit(tracking_evsel, TIME); /* Config events */ - perf_evlist__config(evlist, &opts); + perf_evlist__config(evlist, &opts, NULL); /* Check moved event is still at the front */ if (cycles_evsel != perf_evlist__first(evlist)) { diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index a0d15221db6e8..8db9228663d6f 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -123,11 +123,14 @@ void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx); int perf_evlist__open(struct perf_evlist *evlist); void perf_evlist__close(struct perf_evlist *evlist); +struct callchain_param; + void perf_evlist__set_id_pos(struct perf_evlist *evlist); bool perf_can_sample_identifier(void); bool perf_can_record_switch_events(void); bool perf_can_record_cpu_wide(void); -void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts); +void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts, + struct callchain_param *callchain); int record_opts__config(struct record_opts *opts); int perf_evlist__prepare_workload(struct perf_evlist *evlist, diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 3fd7c2c72f4ad..84252729222d9 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -737,7 +737,8 @@ static void apply_config_terms(struct perf_evsel *evsel, * enable/disable events specifically, as there's no * initial traced exec call. 
*/ -void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts) +void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts, + struct callchain_param *callchain) { struct perf_evsel *leader = evsel->leader; struct perf_event_attr *attr = &evsel->attr; @@ -812,8 +813,8 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts) if (perf_evsel__is_function_event(evsel)) evsel->attr.exclude_callchain_user = 1; - if (callchain_param.enabled && !evsel->no_aux_samples) - perf_evsel__config_callgraph(evsel, opts, &callchain_param); + if (callchain && callchain->enabled && !evsel->no_aux_samples) + perf_evsel__config_callgraph(evsel, opts, callchain); if (opts->sample_intr_regs) { attr->sample_regs_intr = opts->sample_intr_regs; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index ab3632caba9f5..7e45d2130a0fd 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -178,8 +178,11 @@ void perf_evsel__init(struct perf_evsel *evsel, void perf_evsel__exit(struct perf_evsel *evsel); void perf_evsel__delete(struct perf_evsel *evsel); +struct callchain_param; + void perf_evsel__config(struct perf_evsel *evsel, - struct record_opts *opts); + struct record_opts *opts, + struct callchain_param *callchain); int __perf_evsel__sample_size(u64 sample_type); void perf_evsel__calc_id_pos(struct perf_evsel *evsel); diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 0467367dc3155..481792c7484bd 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -129,7 +129,8 @@ bool perf_can_record_cpu_wide(void) return true; } -void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts) +void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts, + struct callchain_param *callchain) { struct perf_evsel *evsel; bool use_sample_identifier = false; @@ -148,7 +149,7 @@ void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts) use_comm_exec = perf_can_comm_exec(); evlist__for_each(evlist, evsel) { - perf_evsel__config(evsel, opts); + perf_evsel__config(evsel, opts, callchain); if (evsel->tracking && use_comm_exec) evsel->attr.comm_exec = 1; } -- GitLab From 22c8a376b55f327f7a25a318e87ba9202ba284bf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Apr 2016 18:37:45 -0300 Subject: [PATCH 160/705] perf evlist: Add (reset,set)_sample_bit methods For fiddling with sample_type fields in all evsels in an evlist. 
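In use, the new wrappers read like the existing per-evsel ones: the macro pastes the short name onto the PERF_SAMPLE_ prefix and the helper loops over every evsel in the list. For example, as a later changeset in this series does:

    /* expands to __perf_evlist__set_sample_bit(evlist, PERF_SAMPLE_IDENTIFIER) */
    perf_evlist__set_sample_bit(evlist, IDENTIFIER);

    /* expands to __perf_evlist__reset_sample_bit(evlist, PERF_SAMPLE_ID) */
    perf_evlist__reset_sample_bit(evlist, ID);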
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-dg6yavctt0hzl2tsgfb43qsr@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 18 ++++++++++++++++++ tools/perf/util/evlist.h | 11 +++++++++++ 2 files changed, 29 insertions(+) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 86a03836a83fc..4c9f510ae18da 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1192,6 +1192,24 @@ void perf_evlist__set_maps(struct perf_evlist *evlist, struct cpu_map *cpus, perf_evlist__propagate_maps(evlist); } +void __perf_evlist__set_sample_bit(struct perf_evlist *evlist, + enum perf_event_sample_format bit) +{ + struct perf_evsel *evsel; + + evlist__for_each(evlist, evsel) + __perf_evsel__set_sample_bit(evsel, bit); +} + +void __perf_evlist__reset_sample_bit(struct perf_evlist *evlist, + enum perf_event_sample_format bit) +{ + struct perf_evsel *evsel; + + evlist__for_each(evlist, evsel) + __perf_evsel__reset_sample_bit(evsel, bit); +} + int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **err_evsel) { struct perf_evsel *evsel; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 8db9228663d6f..da46423998e8c 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -87,6 +87,17 @@ int perf_evlist__add_dummy(struct perf_evlist *evlist); int perf_evlist__add_newtp(struct perf_evlist *evlist, const char *sys, const char *name, void *handler); +void __perf_evlist__set_sample_bit(struct perf_evlist *evlist, + enum perf_event_sample_format bit); +void __perf_evlist__reset_sample_bit(struct perf_evlist *evlist, + enum perf_event_sample_format bit); + +#define perf_evlist__set_sample_bit(evlist, bit) \ + __perf_evlist__set_sample_bit(evlist, PERF_SAMPLE_##bit) + +#define perf_evlist__reset_sample_bit(evlist, bit) \ + __perf_evlist__reset_sample_bit(evlist, PERF_SAMPLE_##bit) + int perf_evlist__set_filter(struct perf_evlist *evlist, const char *filter); int perf_evlist__set_filter_pid(struct perf_evlist *evlist, pid_t pid); int perf_evlist__set_filter_pids(struct perf_evlist *evlist, size_t npids, pid_t *pids); -- GitLab From 01e0d50c3f95cb1bae2dbfd83173bc2864d6d28c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Apr 2016 18:39:37 -0300 Subject: [PATCH 161/705] perf evsel: Rename config_callgraph() to config_callchain() and make it public The rename is for consistency with the parameter name. Make it public for fine-grained control over which evsels should have callchains enabled, as will be done, for instance, in the next changesets in 'perf trace', to enable callchains just on the "raw_syscalls:sys_exit" tracepoint. 
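The intended call pattern, sketched below (the sys_exit_evsel pointer name is hypothetical; this is essentially what the 'perf trace' changeset later in the series ends up doing):

    /* Configure the whole evlist without any global callchain setup... */
    perf_evlist__config(evlist, &opts, NULL);

    /* ...then opt in just the one evsel that should carry callchains. */
    if (opts.callgraph_set && sys_exit_evsel)
    	perf_evsel__config_callchain(sys_exit_evsel, &opts, &callchain_param);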
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-og8vup111rn357g4yagus3ao@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 11 +++++------ tools/perf/util/evsel.h | 3 +++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 84252729222d9..d475a4ec8b570 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -562,10 +562,9 @@ int perf_evsel__group_desc(struct perf_evsel *evsel, char *buf, size_t size) return ret; } -static void -perf_evsel__config_callgraph(struct perf_evsel *evsel, - struct record_opts *opts, - struct callchain_param *param) +void perf_evsel__config_callchain(struct perf_evsel *evsel, + struct record_opts *opts, + struct callchain_param *param) { bool function = perf_evsel__is_function_event(evsel); struct perf_event_attr *attr = &evsel->attr; @@ -705,7 +704,7 @@ static void apply_config_terms(struct perf_evsel *evsel, /* set perf-event callgraph */ if (param.enabled) - perf_evsel__config_callgraph(evsel, opts, ¶m); + perf_evsel__config_callchain(evsel, opts, ¶m); } } @@ -814,7 +813,7 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts, evsel->attr.exclude_callchain_user = 1; if (callchain && callchain->enabled && !evsel->no_aux_samples) - perf_evsel__config_callgraph(evsel, opts, callchain); + perf_evsel__config_callchain(evsel, opts, callchain); if (opts->sample_intr_regs) { attr->sample_regs_intr = opts->sample_intr_regs; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 7e45d2130a0fd..1bd6c2e02dfa1 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -183,6 +183,9 @@ struct callchain_param; void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts, struct callchain_param *callchain); +void perf_evsel__config_callchain(struct perf_evsel *evsel, + struct record_opts *opts, + struct callchain_param *callchain); int __perf_evsel__sample_size(u64 sample_type); void perf_evsel__calc_id_pos(struct perf_evsel *evsel); -- GitLab From fde54b7860ffff1c93e6b9abb3fbc3b8b95f2695 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Apr 2016 18:42:37 -0300 Subject: [PATCH 162/705] perf trace: Make "--call-graph" affect just "raw_syscalls:sys_exit" We don't need the callchains at the syscall enter tracepoint, just when finishing it at syscall exit, so reduce the overhead by asking for callchains just at syscall exit. 
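One side effect handled in the diff below: once only one evsel carries PERF_SAMPLE_CALLCHAIN, sample_type differs across evsels and PERF_SAMPLE_ID no longer sits at a single fixed offset in every record. PERF_SAMPLE_IDENTIFIER solves this by definition: for sample records the identifier is the very first word of the payload. A minimal sketch of the resulting demux read, assuming every evsel sets PERF_SAMPLE_IDENTIFIER:

    #include <linux/perf_event.h>

    /*
     * For a PERF_RECORD_SAMPLE, the identifier is the first u64 after
     * the header, independent of whatever other sample_type bits the
     * producing evsel has set.
     */
    static __u64 sample_record_id(const struct perf_event_header *hdr)
    {
    	return *(const __u64 *)(hdr + 1);
    }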
Suggested-by: Milian Wolff Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-fja1ods5vqpg42mdz09xcz3r@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 08fb100b91fa5..60ab7ce3bc90d 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2749,7 +2749,27 @@ static int trace__run(struct trace *trace, int argc, const char **argv) goto out_delete_evlist; } - perf_evlist__config(evlist, &trace->opts, &callchain_param); + perf_evlist__config(evlist, &trace->opts, NULL); + + if (trace->opts.callgraph_set && trace->syscalls.events.sys_exit) { + perf_evsel__config_callchain(trace->syscalls.events.sys_exit, + &trace->opts, &callchain_param); + /* + * Now we have evsels with different sample_ids, use + * PERF_SAMPLE_IDENTIFIER to map from sample to evsel + * from a fixed position in each ring buffer record. + * + * As of the changeset introducing this comment, this + * isn't strictly needed, as the fields that can come before + * PERF_SAMPLE_ID are all used, but we'll probably disable + * some of those for things like copying the payload of + * pointer syscall arguments, and for vfs_getname we don't + * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this + * here as a warning that we need to use PERF_SAMPLE_IDENTIFIER. + */ + perf_evlist__set_sample_bit(evlist, IDENTIFIER); + perf_evlist__reset_sample_bit(evlist, ID); + } signal(SIGCHLD, sig_handler); signal(SIGINT, sig_handler); -- GitLab From fd4be13067ef65bf33b965a18c717889305d5fea Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Apr 2016 22:03:56 -0300 Subject: [PATCH 163/705] perf evsel: Allow unresolved symbol names to be printed as addresses The fprintf_sym() and fprintf_callchain() methods now allow users to change the existing behaviour of showing "[unknown]" as the name of unresolved symbols to instead show "[0x123456]", i.e. its address. The current patch doesn't change tools to use this facility, the results from 'perf trace' and 'perf script' continue like: 70.109 ( 0.001 ms): qemu-system-x8/10153 poll(ufds: 0x7f2d93ffe870, nfds: 1) = 0 Timeout [unknown] (/usr/lib64/libc-2.22.so) [unknown] (/usr/lib64/libspice-server.so.1.10.0) [unknown] (/usr/lib64/libspice-server.so.1.10.0) [unknown] (/usr/lib64/libspice-server.so.1.10.0) start_thread+0xca (/usr/lib64/libpthread-2.22.so) __clone+0x6d (/usr/lib64/libc-2.22.so) The next patch will make 'perf trace' use the new formatting. 
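Distilled, the fallback chain this adds to the symbol printers is the following (a simplified fragment of the __symbol__fprintf_symname_offs() hunk below, using its sym/al/fp arguments; <inttypes.h> is needed for PRIx64):

    if (sym && sym->name)
    	printed += fprintf(fp, "%s", sym->name);		/* resolved symbol */
    else if (al && unknown_as_addr)
    	printed += fprintf(fp, "[%#" PRIx64 "]", al->addr);	/* new: raw address */
    else
    	printed += fprintf(fp, "[unknown]");			/* old fallback */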
Suggested-by: Milian Wolff Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-fja1ods5vqpg42mdz09xcz3r@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 27 ++++++++++++++++++--------- tools/perf/util/session.h | 1 + tools/perf/util/symbol.c | 25 +++++++++++++++++++++---- tools/perf/util/symbol.h | 6 ++++++ 4 files changed, 46 insertions(+), 13 deletions(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index e384b651a3e86..0516d06a2741a 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1966,6 +1966,7 @@ int perf_evsel__fprintf_callchain(struct perf_evsel *evsel, struct perf_sample * int print_symoffset = print_opts & PRINT_IP_OPT_SYMOFFSET; int print_oneline = print_opts & PRINT_IP_OPT_ONELINE; int print_srcline = print_opts & PRINT_IP_OPT_SRCLINE; + int print_unknown_as_addr = print_opts & PRINT_IP_OPT_UNKNOWN_AS_ADDR; char s = print_oneline ? ' ' : '\t'; if (sample->callchain) { @@ -2003,12 +2004,16 @@ int perf_evsel__fprintf_callchain(struct perf_evsel *evsel, struct perf_sample * if (print_sym) { printed += fprintf(fp, " "); + node_al.addr = addr; + node_al.map = node->map; + if (print_symoffset) { - node_al.addr = addr; - node_al.map = node->map; - printed += symbol__fprintf_symname_offs(node->sym, &node_al, fp); - } else - printed += symbol__fprintf_symname(node->sym, fp); + printed += __symbol__fprintf_symname_offs(node->sym, &node_al, + print_unknown_as_addr, fp); + } else { + printed += __symbol__fprintf_symname(node->sym, &node_al, + print_unknown_as_addr, fp); + } } if (print_dso) { @@ -2043,6 +2048,7 @@ int perf_evsel__fprintf_sym(struct perf_evsel *evsel, struct perf_sample *sample int print_dso = print_opts & PRINT_IP_OPT_DSO; int print_symoffset = print_opts & PRINT_IP_OPT_SYMOFFSET; int print_srcline = print_opts & PRINT_IP_OPT_SRCLINE; + int print_unknown_as_addr = print_opts & PRINT_IP_OPT_UNKNOWN_AS_ADDR; if (symbol_conf.use_callchain && sample->callchain) { printed += perf_evsel__fprintf_callchain(evsel, sample, al, left_alignment, @@ -2055,10 +2061,13 @@ int perf_evsel__fprintf_sym(struct perf_evsel *evsel, struct perf_sample *sample if (print_sym) { printed += fprintf(fp, " "); - if (print_symoffset) - printed += symbol__fprintf_symname_offs(al->sym, al, fp); - else - printed += symbol__fprintf_symname(al->sym, fp); + if (print_symoffset) { + printed += __symbol__fprintf_symname_offs(al->sym, al, + print_unknown_as_addr, fp); + } else { + printed += __symbol__fprintf_symname(al->sym, al, + print_unknown_as_addr, fp); + } } if (print_dso) { diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index ac834908bb35d..4257fac566186 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -42,6 +42,7 @@ struct perf_session { #define PRINT_IP_OPT_SYMOFFSET (1<<3) #define PRINT_IP_OPT_ONELINE (1<<4) #define PRINT_IP_OPT_SRCLINE (1<<5) +#define PRINT_IP_OPT_UNKNOWN_AS_ADDR (1<<6) struct perf_tool; diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index e7588dc915181..bb162ee433c62 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -264,8 +264,9 @@ size_t symbol__fprintf(struct symbol *sym, FILE *fp) sym->name); } -size_t symbol__fprintf_symname_offs(const struct symbol *sym, - const struct addr_location *al, FILE *fp) +size_t __symbol__fprintf_symname_offs(const struct symbol *sym, + const struct addr_location *al, + bool unknown_as_addr, FILE *fp) { unsigned long 
offset; size_t length; @@ -280,13 +281,29 @@ size_t symbol__fprintf_symname_offs(const struct symbol *sym, length += fprintf(fp, "+0x%lx", offset); } return length; - } else + } else if (al && unknown_as_addr) + return fprintf(fp, "[%#" PRIx64 "]", al->addr); + else return fprintf(fp, "[unknown]"); } +size_t symbol__fprintf_symname_offs(const struct symbol *sym, + const struct addr_location *al, + FILE *fp) +{ + return __symbol__fprintf_symname_offs(sym, al, false, fp); +} + +size_t __symbol__fprintf_symname(const struct symbol *sym, + const struct addr_location *al, + bool unknown_as_addr, FILE *fp) +{ + return __symbol__fprintf_symname_offs(sym, al, unknown_as_addr, fp); +} + size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp) { - return symbol__fprintf_symname_offs(sym, NULL, fp); + return __symbol__fprintf_symname_offs(sym, NULL, false, fp); } void symbols__delete(struct rb_root *symbols) diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index c8b7544d92675..e2562568418d8 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -262,8 +262,14 @@ int symbol__init(struct perf_env *env); void symbol__exit(void); void symbol__elf_init(void); struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name); +size_t __symbol__fprintf_symname_offs(const struct symbol *sym, + const struct addr_location *al, + bool unknown_as_addr, FILE *fp); size_t symbol__fprintf_symname_offs(const struct symbol *sym, const struct addr_location *al, FILE *fp); +size_t __symbol__fprintf_symname(const struct symbol *sym, + const struct addr_location *al, + bool unknown_as_addr, FILE *fp); size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp); size_t symbol__fprintf(struct symbol *sym, FILE *fp); bool symbol_type__is_a(char symbol_type, enum map_type map_type); -- GitLab From 00768a2bd3245eace0690fcf2c02776a256b66d7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 11 Apr 2016 22:08:55 -0300 Subject: [PATCH 164/705] perf trace: Print unresolved symbol names as addresses Instead of having "[unknown]" as the name used for unresolved symbols, use the address in the callchain, in hexadecimal form: 28.801 ( 0.007 ms): qemu-system-x8/10065 ppoll(ufds: 0x55c98b39e400, nfds: 72, tsp: 0x7fffe4e4fe60, sigsetsize: 8) = 0 Timeout ppoll+0x91 (/usr/lib64/libc-2.22.so) [0x337309] (/usr/bin/qemu-system-x86_64) [0x336ab4] (/usr/bin/qemu-system-x86_64) main+0x1724 (/usr/bin/qemu-system-x86_64) __libc_start_main+0xf0 (/usr/lib64/libc-2.22.so) [0xc59a9] (/usr/bin/qemu-system-x86_64) 35.265 (14.805 ms): gnome-shell/2287 ... 
[continued]: poll()) = 1 [0xf6fdd] (/usr/lib64/libc-2.22.so) g_main_context_iterate.isra.29+0x17c (/usr/lib64/libglib-2.0.so.0.4600.2) g_main_loop_run+0xc2 (/usr/lib64/libglib-2.0.so.0.4600.2) meta_run+0x2c (/usr/lib64/libmutter.so.0.0.0) main+0x3f7 (/usr/bin/gnome-shell) __libc_start_main+0xf0 (/usr/lib64/libc-2.22.so) [0x2909] (/usr/bin/gnome-shell) Suggested-by: Milian Wolff Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-fja1ods5vqpg42mdz09xcz3r@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 60ab7ce3bc90d..2ec53edcf6492 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2196,8 +2196,9 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, if (sample->callchain) { struct addr_location al; /* TODO: user-configurable print_opts */ - const unsigned int print_opts = PRINT_IP_OPT_SYM - | PRINT_IP_OPT_DSO; + const unsigned int print_opts = PRINT_IP_OPT_SYM | + PRINT_IP_OPT_DSO | + PRINT_IP_OPT_UNKNOWN_AS_ADDR; if (machine__resolve(trace->host, &al, sample) < 0) { pr_err("problem processing %d event, skipping it.\n", -- GitLab From a5052657c164107032d521f0d9e92703d78845f2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Apr 2016 08:52:49 -0700 Subject: [PATCH 165/705] locking/Documentation: Clarify relationship of barrier() to control dependencies The current documentation claims that the compiler ignores barrier(), which is not the case. Instead, the compiler carefully pays attention to barrier(), but in a creative way that still manages to destroy the control dependency. This commit sets the story straight. Reported-by: Mathieu Desnoyers Signed-off-by: Paul E. McKenney Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: dhowells@redhat.com Cc: dipankar@in.ibm.com Cc: dvhart@linux.intel.com Cc: edumazet@google.com Cc: fweisbec@gmail.com Cc: jiangshanlai@gmail.com Cc: josh@joshtriplett.org Cc: oleg@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1460476375-27803-1-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 3729cbe60e416..ec12890423968 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -813,9 +813,10 @@ In summary: the same variable, then those stores must be ordered, either by preceding both of them with smp_mb() or by using smp_store_release() to carry out the stores. Please note that it is -not- sufficient - to use barrier() at beginning of each leg of the "if" statement, - as optimizing compilers do not necessarily respect barrier() - in this case. + to use barrier() at beginning of each leg of the "if" statement + because, as shown by the example above, optimizing compilers can + destroy the control dependency while respecting the letter of the + barrier() law. 
(*) Control dependencies require at least one run-time conditional between the prior load and the subsequent store, and this -- GitLab From 166bda7122c8e817f039bf738cf05ab3b7278732 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 12 Apr 2016 08:52:50 -0700 Subject: [PATCH 166/705] locking/Documentation: Fix missed s/lock/acquire renames The terms 'lock'/'unlock' were changed to 'acquire'/'release' by the following commit: 2e4f5382d12a4 ("locking/doc: Rename LOCK/UNLOCK to ACQUIRE/RELEASE") However, the commit missed to change the table of contents - fix that. Also, the dumb rename changed the section name 'Locking functions' to an actively misleading 'Acquiring functions' section name. Rename it to 'Lock acquisition functions' instead. Suggested-by: David Howells Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: dipankar@in.ibm.com Cc: dvhart@linux.intel.com Cc: edumazet@google.com Cc: fweisbec@gmail.com Cc: jiangshanlai@gmail.com Cc: josh@joshtriplett.org Cc: mathieu.desnoyers@efficios.com Cc: oleg@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1460476375-27803-2-git-send-email-paulmck@linux.vnet.ibm.com [ Rewrote the changelog. ] Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index ec12890423968..38b1ce161afb3 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -31,15 +31,15 @@ Contents: (*) Implicit kernel memory barriers. - - Locking functions. + - Lock acquisition functions. - Interrupt disabling functions. - Sleep and wake-up functions. - Miscellaneous functions. - (*) Inter-CPU locking barrier effects. + (*) Inter-CPU acquiring barrier effects. - - Locks vs memory accesses. - - Locks vs I/O accesses. + - Acquires vs memory accesses. + - Acquires vs I/O accesses. (*) Where are memory barriers needed? @@ -1859,7 +1859,7 @@ This is a variation on the mandatory write barrier that causes writes to weakly ordered I/O regions to be partially ordered. Its effects may go beyond the CPU->Hardware interface and actually affect the hardware at some level. -See the subsection "Locks vs I/O accesses" for more information. +See the subsection "Acquires vs I/O accesses" for more information. =============================== @@ -1874,8 +1874,8 @@ provide more substantial guarantees, but these may not be relied upon outside of arch specific code. -ACQUIRING FUNCTIONS -------------------- +LOCK ACQUISITION FUNCTIONS +-------------------------- The Linux kernel has a number of locking constructs: -- GitLab From 01e1cd6de8e75fa28c268b4dc566bc1a39486e71 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 12 Apr 2016 08:52:51 -0700 Subject: [PATCH 167/705] locking/Documentation: Add missed subsection in TOC A 'Virtual Machine Guests' subsection was added by this commit: 6a65d26385bf487 ("asm-generic: implement virt_xxx memory barriers") but the TOC was not updated - update it. Signed-off-by: SeongJae Park Signed-off-by: Paul E. 
McKenney Acked-by: David Howells Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: dipankar@in.ibm.com Cc: dvhart@linux.intel.com Cc: edumazet@google.com Cc: fweisbec@gmail.com Cc: jiangshanlai@gmail.com Cc: josh@joshtriplett.org Cc: mathieu.desnoyers@efficios.com Cc: oleg@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1460476375-27803-3-git-send-email-paulmck@linux.vnet.ibm.com [ Rewrote the changelog. ] Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 38b1ce161afb3..718ef2564fa03 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -61,6 +61,7 @@ Contents: (*) The things CPUs get up to. - And then there's the Alpha. + - Virtual Machine Guests. (*) Example uses. -- GitLab From 3dbf0913f6cac722805a94f16b1e61ffc3483eaf Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 12 Apr 2016 08:52:52 -0700 Subject: [PATCH 168/705] locking/Documentation: Fix formatting inconsistencies Signed-off-by: SeongJae Park Signed-off-by: Paul E. McKenney Acked-by: David Howells Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: dipankar@in.ibm.com Cc: dvhart@linux.intel.com Cc: edumazet@google.com Cc: fweisbec@gmail.com Cc: jiangshanlai@gmail.com Cc: josh@joshtriplett.org Cc: mathieu.desnoyers@efficios.com Cc: oleg@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1460476375-27803-4-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 718ef2564fa03..1f15418622390 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -149,7 +149,7 @@ As a further example, consider this sequence of events: CPU 1 CPU 2 =============== =============== - { A == 1, B == 2, C = 3, P == &A, Q == &C } + { A == 1, B == 2, C == 3, P == &A, Q == &C } B = 4; Q = P; P = &B D = *Q; @@ -518,7 +518,7 @@ following sequence of events: CPU 1 CPU 2 =============== =============== - { A == 1, B == 2, C = 3, P == &A, Q == &C } + { A == 1, B == 2, C == 3, P == &A, Q == &C } B = 4; WRITE_ONCE(P, &B) @@ -545,7 +545,7 @@ between the address load and the data load: CPU 1 CPU 2 =============== =============== - { A == 1, B == 2, C = 3, P == &A, Q == &C } + { A == 1, B == 2, C == 3, P == &A, Q == &C } B = 4; WRITE_ONCE(P, &B); @@ -3043,7 +3043,7 @@ The Alpha defines the Linux kernel's memory barrier model. See the subsection on "Cache Coherency" above. VIRTUAL MACHINE GUESTS -------------------- +---------------------- Guests running within virtual machines might be affected by SMP effects even if the guest itself is compiled without SMP support. This is an artifact of -- GitLab From 0b6fa347dc08c6f757a35f3a180269b3ffc4cd28 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 12 Apr 2016 08:52:53 -0700 Subject: [PATCH 169/705] locking/Documentation: Insert white spaces consistently The document uses two newlines between sections, one newline between item and its detailed description, and two spaces between sentences. There are a few places that used these rules inconsistently - fix them. Signed-off-by: SeongJae Park Signed-off-by: Paul E. 
McKenney Acked-by: David Howells Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: dipankar@in.ibm.com Cc: dvhart@linux.intel.com Cc: edumazet@google.com Cc: fweisbec@gmail.com Cc: jiangshanlai@gmail.com Cc: josh@joshtriplett.org Cc: mathieu.desnoyers@efficios.com Cc: oleg@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1460476375-27803-5-git-send-email-paulmck@linux.vnet.ibm.com [ Fixed the changelog. ] Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 43 +++++++++++++++++-------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 1f15418622390..7133626a61d03 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1733,15 +1733,15 @@ The Linux kernel has eight basic CPU memory barriers: All memory barriers except the data dependency barriers imply a compiler -barrier. Data dependencies do not impose any additional compiler ordering. +barrier. Data dependencies do not impose any additional compiler ordering. Aside: In the case of data dependencies, the compiler would be expected to issue the loads in the correct order (eg. `a[b]` would have to load the value of b before loading a[b]), however there is no guarantee in the C specification that the compiler may not speculate the value of b (eg. is equal to 1) and load a before b (eg. tmp = a[1]; if (b != 1) -tmp = a[b]; ). There is also the problem of a compiler reloading b after -having loaded a[b], thus having a newer copy of b than a[b]. A consensus +tmp = a[b]; ). There is also the problem of a compiler reloading b after +having loaded a[b], thus having a newer copy of b than a[b]. A consensus has not yet been reached about these problems, however the READ_ONCE() macro is a good place to start looking. @@ -1796,6 +1796,7 @@ There are some more advanced barrier functions: (*) lockless_dereference(); + This can be thought of as a pointer-fetch wrapper around the smp_read_barrier_depends() data-dependency barrier. @@ -1897,7 +1898,7 @@ for each construct. These operations all imply certain barriers: Memory operations issued before the ACQUIRE may be completed after the ACQUIRE operation has completed. An smp_mb__before_spinlock(), combined with a following ACQUIRE, orders prior stores against - subsequent loads and stores. Note that this is weaker than smp_mb()! + subsequent loads and stores. Note that this is weaker than smp_mb()! The smp_mb__before_spinlock() primitive is free on many architectures. (2) RELEASE operation implication: @@ -2092,9 +2093,9 @@ or: event_indicated = 1; wake_up_process(event_daemon); -A write memory barrier is implied by wake_up() and co. if and only if they wake -something up. The barrier occurs before the task state is cleared, and so sits -between the STORE to indicate the event and the STORE to set TASK_RUNNING: +A write memory barrier is implied by wake_up() and co. if and only if they +wake something up. The barrier occurs before the task state is cleared, and so +sits between the STORE to indicate the event and the STORE to set TASK_RUNNING: CPU 1 CPU 2 =============================== =============================== @@ -2208,7 +2209,7 @@ three CPUs; then should the following sequence of events occur: Then there is no guarantee as to what order CPU 3 will see the accesses to *A through *H occur in, other than the constraints imposed by the separate locks -on the separate CPUs. 
It might, for example, see: +on the separate CPUs. It might, for example, see: *E, ACQUIRE M, ACQUIRE Q, *G, *C, *F, *A, *B, RELEASE Q, *D, *H, RELEASE M @@ -2488,9 +2489,9 @@ The following operations are special locking primitives: clear_bit_unlock(); __clear_bit_unlock(); -These implement ACQUIRE-class and RELEASE-class operations. These should be used in -preference to other operations when implementing locking primitives, because -their implementations can be optimised on many architectures. +These implement ACQUIRE-class and RELEASE-class operations. These should be +used in preference to other operations when implementing locking primitives, +because their implementations can be optimised on many architectures. [!] Note that special memory barrier primitives are available for these situations because on some CPUs the atomic instructions used imply full memory @@ -2570,12 +2571,12 @@ explicit barriers are used. Normally this won't be a problem because the I/O accesses done inside such sections will include synchronous load operations on strictly ordered I/O -registers that form implicit I/O barriers. If this isn't sufficient then an +registers that form implicit I/O barriers. If this isn't sufficient then an mmiowb() may need to be used explicitly. A similar situation may occur between an interrupt routine and two routines -running on separate CPUs that communicate with each other. If such a case is +running on separate CPUs that communicate with each other. If such a case is likely, then interrupt-disabling locks should be used to guarantee ordering. @@ -2589,8 +2590,8 @@ functions: (*) inX(), outX(): These are intended to talk to I/O space rather than memory space, but - that's primarily a CPU-specific concept. The i386 and x86_64 processors do - indeed have special I/O space access cycles and instructions, but many + that's primarily a CPU-specific concept. The i386 and x86_64 processors + do indeed have special I/O space access cycles and instructions, but many CPUs don't have such a concept. The PCI bus, amongst others, defines an I/O space concept which - on such @@ -2612,7 +2613,7 @@ functions: Whether these are guaranteed to be fully ordered and uncombined with respect to each other on the issuing CPU depends on the characteristics - defined for the memory window through which they're accessing. On later + defined for the memory window through which they're accessing. On later i386 architecture machines, for example, this is controlled by way of the MTRR registers. @@ -2637,10 +2638,10 @@ functions: (*) readX_relaxed(), writeX_relaxed() These are similar to readX() and writeX(), but provide weaker memory - ordering guarantees. Specifically, they do not guarantee ordering with + ordering guarantees. Specifically, they do not guarantee ordering with respect to normal memory accesses (e.g. DMA buffers) nor do they guarantee - ordering with respect to LOCK or UNLOCK operations. If the latter is - required, an mmiowb() barrier can be used. Note that relaxed accesses to + ordering with respect to LOCK or UNLOCK operations. If the latter is + required, an mmiowb() barrier can be used. Note that relaxed accesses to the same peripheral are guaranteed to be ordered with respect to each other. @@ -3042,6 +3043,7 @@ The Alpha defines the Linux kernel's memory barrier model. See the subsection on "Cache Coherency" above. + VIRTUAL MACHINE GUESTS ---------------------- @@ -3052,7 +3054,7 @@ barriers for this use-case would be possible but is often suboptimal. 
To handle this case optimally, low-level virt_mb() etc macros are available. These have the same effect as smp_mb() etc when SMP is enabled, but generate -identical code for SMP and non-SMP systems. For example, virtual machine guests +identical code for SMP and non-SMP systems. For example, virtual machine guests should use virt_mb() rather than smp_mb() when synchronizing against a (possibly SMP) host. @@ -3060,6 +3062,7 @@ These are equivalent to smp_mb() etc counterparts in all other respects, in particular, they do not control MMIO effects: to control MMIO effects, use mandatory barriers. + ============ EXAMPLE USES ============ -- GitLab From 787df6383caa1338a4f6640d71917bc2d8c068b1 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 12 Apr 2016 08:52:55 -0700 Subject: [PATCH 170/705] locking/Documentation: Mention smp_cond_acquire() ... do this next to smp_load_acquire() when first mentioning ACQUIRE. While this call is briefly explained and control dependencies are mentioned later, it does not hurt the reader. Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: dhowells@redhat.com Cc: dipankar@in.ibm.com Cc: dvhart@linux.intel.com Cc: edumazet@google.com Cc: fweisbec@gmail.com Cc: jiangshanlai@gmail.com Cc: josh@joshtriplett.org Cc: mathieu.desnoyers@efficios.com Cc: oleg@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1460476375-27803-7-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 7133626a61d03..a9454b1c73bd4 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -431,8 +431,9 @@ And a couple of implicit varieties: This acts as a one-way permeable barrier. It guarantees that all memory operations after the ACQUIRE operation will appear to happen after the ACQUIRE operation with respect to the other components of the system. - ACQUIRE operations include LOCK operations and smp_load_acquire() - operations. + ACQUIRE operations include LOCK operations and both smp_load_acquire() + and smp_cond_acquire() operations. The latter builds the necessary ACQUIRE + semantics by relying on a control dependency and smp_rmb(). Memory operations that occur before an ACQUIRE operation may appear to happen after it completes. -- GitLab From 1f190931893a98ffd5d4cfdfbfc2452ad0ed3e1b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 12 Apr 2016 08:47:17 -0700 Subject: [PATCH 171/705] locking/locktorture: Fix deboosting NULL pointer dereference For the case of rtmutex torturing we will randomly call into the boost() handler, including upon module exiting when the tasks are deboosted before stopping. In such cases the task may or may not have already been boosted, and therefore the NULL being explicitly passed can occur anywhere. Currently we only assume that the task is at a higher prio, and in consequence, dereference a NULL pointer. 
This patch fixes the case of a rmmod locktorture exploding while pounding on the rtmutex lock (partial trace): task: ffff88081026cf80 ti: ffff880816120000 task.ti: ffff880816120000 RSP: 0018:ffff880816123eb0 EFLAGS: 00010206 RAX: ffff88081026cf80 RBX: ffff880816bfa630 RCX: 0000000000160d1b RDX: 0000000000000000 RSI: 0000000000000202 RDI: 0000000000000000 RBP: ffff88081026cf80 R08: 000000000000001f R09: ffff88017c20ca80 R10: 0000000000000000 R11: 000000000048c316 R12: ffffffffa05d1840 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff88203f880000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 0000000001c0a000 CR4: 00000000000406e0 Stack: ffffffffa05d141d ffff880816bfa630 ffffffffa05d1922 ffff88081e70c2c0 ffff880816bfa630 ffffffff81095fed 0000000000000000 ffffffff8107bf60 ffff880816bfa630 ffffffff00000000 ffff880800000000 ffff880816123f08 Call Trace: [] kthread+0xbd/0xe0 [] ret_from_fork+0x3f/0x70 This patch ensures that if the random state pointer is not NULL and current is not boosted, then do nothing. RIP: 0010:[] [] torture_random+0x5/0x60 [torture] [] torture_rtmutex_boost+0x1d/0x90 [locktorture] [] lock_torture_writer+0xe2/0x170 [locktorture] Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: dhowells@redhat.com Cc: dipankar@in.ibm.com Cc: dvhart@linux.intel.com Cc: edumazet@google.com Cc: fweisbec@gmail.com Cc: jiangshanlai@gmail.com Cc: josh@joshtriplett.org Cc: mathieu.desnoyers@efficios.com Cc: oleg@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1460476038-27060-1-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/locking/locktorture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8ef1919d63b24..9e9c5f454f5c9 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -394,12 +394,12 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp) if (!rt_task(current)) { /* - * (1) Boost priority once every ~50k operations. When the + * Boost priority once every ~50k operations. When the * task tries to take the lock, the rtmutex it will account * for the new priority, and do any corresponding pi-dance. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * factor))) { + if (trsp && !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor))) { policy = SCHED_FIFO; param.sched_priority = MAX_RT_PRIO - 1; } else /* common case, do nothing */ -- GitLab From c1c33b92db4fb274dfbff778ccf2459e4bebd48e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 12 Apr 2016 08:47:18 -0700 Subject: [PATCH 172/705] locking/locktorture: Fix NULL pointer dereference for cleanup paths It has been found that paths that invoke cleanups through lock_torture_cleanup() can trigger NULL pointer dereferencing bugs during the statistics printing phase. This is mainly because we should not be calling into statistics before we are sure things have been set up correctly. Specifically, early checks (and the need for handling this in the cleanup call) only include parameter checks and basic statistics allocation. Once we start write/read kthreads we then consider the test as started. 
As such, update the function in question to check for cxt.lwsa writer stats, if not set, we either have a bogus parameter or -ENOMEM situation and therefore only need to deal with general torture calls. Reported-and-tested-by: Kefeng Wang Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: dhowells@redhat.com Cc: dipankar@in.ibm.com Cc: dvhart@linux.intel.com Cc: edumazet@google.com Cc: fweisbec@gmail.com Cc: jiangshanlai@gmail.com Cc: josh@joshtriplett.org Cc: mathieu.desnoyers@efficios.com Cc: oleg@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1460476038-27060-2-git-send-email-paulmck@linux.vnet.ibm.com [ Improved the changelog. ] Signed-off-by: Ingo Molnar --- kernel/locking/locktorture.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 9e9c5f454f5c9..d066a50dc87e6 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -748,6 +748,15 @@ static void lock_torture_cleanup(void) if (torture_cleanup_begin()) return; + /* + * Indicates early cleanup, meaning that the test has not run, + * such as when passing bogus args when loading the module. As + * such, only perform the underlying torture-specific cleanups, + * and avoid anything related to locktorture. + */ + if (!cxt.lwsa) + goto end; + if (writer_tasks) { for (i = 0; i < cxt.nrealwriters_stress; i++) torture_stop_kthread(lock_torture_writer, @@ -776,6 +785,7 @@ static void lock_torture_cleanup(void) else lock_torture_print_module_parms(cxt.cur_ops, "End of test: SUCCESS"); +end: torture_cleanup_end(); } @@ -870,6 +880,7 @@ static int __init lock_torture_init(void) VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); firsterr = -ENOMEM; kfree(cxt.lwsa); + cxt.lwsa = NULL; goto unwind; } @@ -878,6 +889,7 @@ static int __init lock_torture_init(void) cxt.lrsa[i].n_lock_acquired = 0; } } + lock_torture_print_module_parms(cxt.cur_ops, "Start of test"); /* Prepare torture context. */ -- GitLab From dac429874d8156d97460c61049e202b2dcc15df8 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 9 Apr 2016 13:17:29 +0200 Subject: [PATCH 173/705] uprobes/x86: Constify uprobe_xol_ops structures The uprobe_xol_ops structures are never modified, so declare them as const. Done with the help of Coccinelle. Signed-off-by: Julia Lawall Cc: Andrew Morton Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-janitors@vger.kernel.org Link: http://lkml.kernel.org/r/1460200649-32526-1-git-send-email-Julia.Lawall@lip6.fr Signed-off-by: Ingo Molnar --- arch/x86/kernel/uprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index bf4db6eaec8fd..bd074151bfd64 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -578,7 +578,7 @@ static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs) riprel_post_xol(auprobe, regs); } -static struct uprobe_xol_ops default_xol_ops = { +static const struct uprobe_xol_ops default_xol_ops = { .pre_xol = default_pre_xol_op, .post_xol = default_post_xol_op, .abort = default_abort_op, @@ -695,7 +695,7 @@ static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn) 0, insn->immediate.nbytes); } -static struct uprobe_xol_ops branch_xol_ops = { +static const struct uprobe_xol_ops branch_xol_ops = { .emulate = branch_emulate_op, .post_xol = branch_post_xol_op, }; -- GitLab From c003ed928962a55eb446e78c544b1d7c4f6cb88a Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 8 Apr 2016 20:58:46 +0200 Subject: [PATCH 174/705] locking/lockdep: Deinline register_lock_class(), save 2328 bytes This function compiles to 1328 bytes of machine code. Three callsites. Registering a new lock class is definitely not *that* time-critical to inline it. Signed-off-by: Denys Vlasenko Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1460141926-13069-5-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ed9410936a220..7cc43ef856c1f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -708,7 +708,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) * yet. Otherwise we look it up. We cache the result in the lock object * itself, so actual lookup of the hash should be once per lock object. */ -static inline struct lock_class * +static struct lock_class * register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; -- GitLab From 0051202f6ad5fd9c04d220343e66d1eb890f7b81 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 7 Apr 2016 17:31:44 -0700 Subject: [PATCH 175/705] selftests/x86: Test the FSBASE/GSBASE API and context switching This catches two distinct bugs in the current code. I'll fix them. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rudolf Marek Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/7e5941148d1e2199f070dadcdf7355959f5f8e85.1460075211.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/Makefile | 1 + tools/testing/selftests/x86/fsgsbase.c | 398 +++++++++++++++++++++++++ 2 files changed, 399 insertions(+) create mode 100644 tools/testing/selftests/x86/fsgsbase.c diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index b47ebd1706907..c73425de3cfe7 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -9,6 +9,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_sysc TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer +TARGETS_C_64BIT_ONLY := fsgsbase TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY) diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c new file mode 100644 index 0000000000000..5b2b4b3c634ca --- /dev/null +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -0,0 +1,398 @@ +/* + * fsgsbase.c, an fsgsbase test + * Copyright (c) 2014-2016 Andy Lutomirski + * GPL v2 + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef __x86_64__ +# error This test is 64-bit only +#endif + +static volatile sig_atomic_t want_segv; +static volatile unsigned long segv_addr; + +static int nerrs; + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static void clearhandler(int sig) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static void sigsegv(int sig, siginfo_t *si, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t*)ctx_void; + + if (!want_segv) { + clearhandler(SIGSEGV); + return; /* Crash cleanly. */ + } + + want_segv = false; + segv_addr = (unsigned long)si->si_addr; + + ctx->uc_mcontext.gregs[REG_RIP] += 4; /* Skip the faulting mov */ + +} + +enum which_base { FS, GS }; + +static unsigned long read_base(enum which_base which) +{ + unsigned long offset; + /* + * Unless we have FSGSBASE, there's no direct way to do this from + * user mode. We can get at it indirectly using signals, though. + */ + + want_segv = true; + + offset = 0; + if (which == FS) { + /* Use a constant-length instruction here. */ + asm volatile ("mov %%fs:(%%rcx), %%rax" : : "c" (offset) : "rax"); + } else { + asm volatile ("mov %%gs:(%%rcx), %%rax" : : "c" (offset) : "rax"); + } + if (!want_segv) + return segv_addr + offset; + + /* + * If that didn't segfault, try the other end of the address space. + * Unless we get really unlucky and run into the vsyscall page, this + * is guaranteed to segfault. 
+ */ + + offset = (ULONG_MAX >> 1) + 1; + if (which == FS) { + asm volatile ("mov %%fs:(%%rcx), %%rax" + : : "c" (offset) : "rax"); + } else { + asm volatile ("mov %%gs:(%%rcx), %%rax" + : : "c" (offset) : "rax"); + } + if (!want_segv) + return segv_addr + offset; + + abort(); +} + +static void check_gs_value(unsigned long value) +{ + unsigned long base; + unsigned short sel; + + printf("[RUN]\tARCH_SET_GS to 0x%lx\n", value); + if (syscall(SYS_arch_prctl, ARCH_SET_GS, value) != 0) + err(1, "ARCH_SET_GS"); + + asm volatile ("mov %%gs, %0" : "=rm" (sel)); + base = read_base(GS); + if (base == value) { + printf("[OK]\tGSBASE was set as expected (selector 0x%hx)\n", + sel); + } else { + nerrs++; + printf("[FAIL]\tGSBASE was not as expected: got 0x%lx (selector 0x%hx)\n", + base, sel); + } + + if (syscall(SYS_arch_prctl, ARCH_GET_GS, &base) != 0) + err(1, "ARCH_GET_GS"); + if (base == value) { + printf("[OK]\tARCH_GET_GS worked as expected (selector 0x%hx)\n", + sel); + } else { + nerrs++; + printf("[FAIL]\tARCH_GET_GS was not as expected: got 0x%lx (selector 0x%hx)\n", + base, sel); + } +} + +static void mov_0_gs(unsigned long initial_base, bool schedule) +{ + unsigned long base, arch_base; + + printf("[RUN]\tARCH_SET_GS to 0x%lx then mov 0 to %%gs%s\n", initial_base, schedule ? " and schedule " : ""); + if (syscall(SYS_arch_prctl, ARCH_SET_GS, initial_base) != 0) + err(1, "ARCH_SET_GS"); + + if (schedule) + usleep(10); + + asm volatile ("mov %0, %%gs" : : "rm" (0)); + base = read_base(GS); + if (syscall(SYS_arch_prctl, ARCH_GET_GS, &arch_base) != 0) + err(1, "ARCH_GET_GS"); + if (base == arch_base) { + printf("[OK]\tGSBASE is 0x%lx\n", base); + } else { + nerrs++; + printf("[FAIL]\tGSBASE changed to 0x%lx but kernel reports 0x%lx\n", base, arch_base); + } +} + +static volatile unsigned long remote_base; +static volatile bool remote_hard_zero; +static volatile unsigned int ftx; + +/* + * ARCH_SET_FS/GS(0) may or may not program a selector of zero. HARD_ZERO + * means to force the selector to zero to improve test coverage. + */ +#define HARD_ZERO 0xa1fa5f343cb85fa4 + +static void do_remote_base() +{ + unsigned long to_set = remote_base; + bool hard_zero = false; + if (to_set == HARD_ZERO) { + to_set = 0; + hard_zero = true; + } + + if (syscall(SYS_arch_prctl, ARCH_SET_GS, to_set) != 0) + err(1, "ARCH_SET_GS"); + + if (hard_zero) + asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0)); + + unsigned short sel; + asm volatile ("mov %%gs, %0" : "=rm" (sel)); + printf("\tother thread: ARCH_SET_GS(0x%lx)%s -- sel is 0x%hx\n", + to_set, hard_zero ? " and clear gs" : "", sel); +} + +void do_unexpected_base(void) +{ + /* + * The goal here is to try to arrange for GS == 0, GSBASE != + * 0, and for the the kernel the think that GSBASE == 0. + * + * To make the test as reliable as possible, this uses + * explicit descriptorss. (This is not the only way. This + * could use ARCH_SET_GS with a low, nonzero base, but the + * relevant side effect of ARCH_SET_GS could change.) + */ + + /* Step 1: tell the kernel that we have GSBASE == 0. */ + if (syscall(SYS_arch_prctl, ARCH_SET_GS, 0) != 0) + err(1, "ARCH_SET_GS"); + + /* Step 2: change GSBASE without telling the kernel. 
*/ + struct user_desc desc = { + .entry_number = 0, + .base_addr = 0xBAADF00D, + .limit = 0xfffff, + .seg_32bit = 1, + .contents = 0, /* Data, grow-up */ + .read_exec_only = 0, + .limit_in_pages = 1, + .seg_not_present = 0, + .useable = 0 + }; + if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) == 0) { + printf("\tother thread: using LDT slot 0\n"); + asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7)); + } else { + /* No modify_ldt for us (configured out, perhaps) */ + + struct user_desc *low_desc = mmap( + NULL, sizeof(desc), + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0); + memcpy(low_desc, &desc, sizeof(desc)); + + low_desc->entry_number = -1; + + /* 32-bit set_thread_area */ + long ret; + asm volatile ("int $0x80" + : "=a" (ret) : "a" (243), "b" (low_desc) + : "flags"); + memcpy(&desc, low_desc, sizeof(desc)); + munmap(low_desc, sizeof(desc)); + + if (ret != 0) { + printf("[NOTE]\tcould not create a segment -- test won't do anything\n"); + return; + } + printf("\tother thread: using GDT slot %d\n", desc.entry_number); + asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)((desc.entry_number << 3) | 0x3))); + } + + /* + * Step 3: set the selector back to zero. On AMD chips, this will + * preserve GSBASE. + */ + + asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0)); +} + +static void *threadproc(void *ctx) +{ + while (1) { + while (ftx == 0) + syscall(SYS_futex, &ftx, FUTEX_WAIT, 0, NULL, NULL, 0); + if (ftx == 3) + return NULL; + + if (ftx == 1) + do_remote_base(); + else if (ftx == 2) + do_unexpected_base(); + else + errx(1, "helper thread got bad command"); + + ftx = 0; + syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0); + } +} + +static void set_gs_and_switch_to(unsigned long local, unsigned long remote) +{ + unsigned long base; + + bool hard_zero = false; + if (local == HARD_ZERO) { + hard_zero = true; + local = 0; + } + + printf("[RUN]\tARCH_SET_GS(0x%lx)%s, then schedule to 0x%lx\n", + local, hard_zero ? 
" and clear gs" : "", remote); + if (syscall(SYS_arch_prctl, ARCH_SET_GS, local) != 0) + err(1, "ARCH_SET_GS"); + if (hard_zero) + asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0)); + + if (read_base(GS) != local) { + nerrs++; + printf("[FAIL]\tGSBASE wasn't set as expected\n"); + } + + remote_base = remote; + ftx = 1; + syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0); + while (ftx != 0) + syscall(SYS_futex, &ftx, FUTEX_WAIT, 1, NULL, NULL, 0); + + base = read_base(GS); + if (base == local) { + printf("[OK]\tGSBASE remained 0x%lx\n", local); + } else { + nerrs++; + printf("[FAIL]\tGSBASE changed to 0x%lx\n", base); + } +} + +static void test_unexpected_base(void) +{ + unsigned long base; + + printf("[RUN]\tARCH_SET_GS(0), clear gs, then manipulate GSBASE in a different thread\n"); + if (syscall(SYS_arch_prctl, ARCH_SET_GS, 0) != 0) + err(1, "ARCH_SET_GS"); + asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0)); + + ftx = 2; + syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0); + while (ftx != 0) + syscall(SYS_futex, &ftx, FUTEX_WAIT, 1, NULL, NULL, 0); + + base = read_base(GS); + if (base == 0) { + printf("[OK]\tGSBASE remained 0\n"); + } else { + nerrs++; + printf("[FAIL]\tGSBASE changed to 0x%lx\n", base); + } +} + +int main() +{ + pthread_t thread; + + sethandler(SIGSEGV, sigsegv, 0); + + check_gs_value(0); + check_gs_value(1); + check_gs_value(0x200000000); + check_gs_value(0); + check_gs_value(0x200000000); + check_gs_value(1); + + for (int sched = 0; sched < 2; sched++) { + mov_0_gs(0, !!sched); + mov_0_gs(1, !!sched); + mov_0_gs(0x200000000, !!sched); + } + + /* Set up for multithreading. */ + + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) + err(1, "sched_setaffinity to CPU 0"); /* should never fail */ + + if (pthread_create(&thread, 0, threadproc, 0) != 0) + err(1, "pthread_create"); + + static unsigned long bases_with_hard_zero[] = { + 0, HARD_ZERO, 1, 0x200000000, + }; + + for (int local = 0; local < 4; local++) { + for (int remote = 0; remote < 4; remote++) { + set_gs_and_switch_to(bases_with_hard_zero[local], + bases_with_hard_zero[remote]); + } + } + + test_unexpected_base(); + + ftx = 3; /* Kill the thread. */ + syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0); + + if (pthread_join(thread, NULL) != 0) + err(1, "pthread_join"); + + return nerrs == 0 ? 0 : 1; +} -- GitLab From d47b50e7a111bb7a56fb1c974728b56209d7f515 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 7 Apr 2016 17:31:45 -0700 Subject: [PATCH 176/705] x86/arch_prctl: Fix ARCH_GET_FS and ARCH_GET_GS ARCH_GET_FS and ARCH_GET_GS attempted to figure out the fsbase and gsbase respectively from saved thread state. This was wrong: fsbase and gsbase live in registers while a thread is running, not in memory. For reasons I can't fathom, the fsbase and gsbase code were different. Since neither was correct, I didn't try to figure out what the point of the difference was. Change it to simply read the MSRs. The code for reading the base for a remote thread is also completely wrong if the target thread uses its own descriptors (which is the case for all 32-bit threaded programs), but fixing that is a different story. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rudolf Marek Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c6e7b507c72ca3bdbf6c7a8a3ceaa0334e873bd9.1460075211.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6cbab31ac23a2..c671b9b015c0e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -566,10 +566,10 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) break; case ARCH_GET_FS: { unsigned long base; - if (task->thread.fsindex == FS_TLS_SEL) - base = read_32bit_tls(task, FS_TLS); - else if (doit) + if (doit) rdmsrl(MSR_FS_BASE, base); + else if (task->thread.fsindex == FS_TLS_SEL) + base = read_32bit_tls(task, FS_TLS); else base = task->thread.fs; ret = put_user(base, (unsigned long __user *)addr); @@ -577,16 +577,11 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) } case ARCH_GET_GS: { unsigned long base; - unsigned gsindex; - if (task->thread.gsindex == GS_TLS_SEL) + if (doit) + rdmsrl(MSR_KERNEL_GS_BASE, base); + else if (task->thread.gsindex == GS_TLS_SEL) base = read_32bit_tls(task, GS_TLS); - else if (doit) { - savesegment(gs, gsindex); - if (gsindex) - rdmsrl(MSR_KERNEL_GS_BASE, base); - else - base = task->thread.gs; - } else + else base = task->thread.gs; ret = put_user(base, (unsigned long __user *)addr); break; -- GitLab From 7a5d67048745e3eab62779c6d043a2e3d95dc848 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 7 Apr 2016 17:31:46 -0700 Subject: [PATCH 177/705] x86/cpu: Probe the behavior of nulling out a segment at boot time AMD and Intel do different things when writing zero to a segment selector. Since neither vendor documents the behavior well and it's easy to test the behavior, try nulling fs to see what happens. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rudolf Marek Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/61588ba0e0df35beafd363dc8b68a4c5878ef095.1460075211.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/common.c | 31 ++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 8f9afefd2dc5a..2a052302bc439 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -294,6 +294,7 @@ #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#define X86_BUG_NULL_SEG X86_BUG(9) /* Nulling a selector preserves the base */ #ifdef CONFIG_X86_32 /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7fea4079d1020..8e40eee5843a3 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -889,6 +889,35 @@ static void detect_nopl(struct cpuinfo_x86 *c) #endif } +static void detect_null_seg_behavior(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + /* + * Empirically, writing zero to a segment selector on AMD does + * not clear the base, whereas writing zero to a segment + * selector on Intel does clear the base. 
Intel's behavior + * allows slightly faster context switches in the common case + * where GS is unused by the prev and next threads. + * + * Since neither vendor documents this anywhere that I can see, + * detect it directly instead of hardcoding the choice by + * vendor. + * + * I've designated AMD's behavior as the "bug" because it's + * counterintuitive and less friendly. + */ + + unsigned long old_base, tmp; + rdmsrl(MSR_FS_BASE, old_base); + wrmsrl(MSR_FS_BASE, 1); + loadsegment(fs, 0); + rdmsrl(MSR_FS_BASE, tmp); + if (tmp != 0) + set_cpu_bug(c, X86_BUG_NULL_SEG); + wrmsrl(MSR_FS_BASE, old_base); +#endif +} + static void generic_identify(struct cpuinfo_x86 *c) { c->extended_cpuid_level = 0; @@ -921,6 +950,8 @@ static void generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ detect_nopl(c); + + detect_null_seg_behavior(c); } static void x86_init_cache_qos(struct cpuinfo_x86 *c) -- GitLab From 3e2b68d752c9e09c40d76442aa94d3b8e421b0f1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 7 Apr 2016 17:31:47 -0700 Subject: [PATCH 178/705] x86/asm, sched/x86: Rewrite the FS and GS context switch code The old code was incomprehensible and was buggy on AMD CPUs. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rudolf Marek Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5f6bde874c6fe6831c6711b5b1522a238ba035b4.1460075211.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 148 ++++++++++++++++++++++------------- 1 file changed, 93 insertions(+), 55 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c671b9b015c0e..50337eac1ca2d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -282,7 +282,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - unsigned fsindex, gsindex; + unsigned prev_fsindex, prev_gsindex; fpu_switch_t fpu_switch; fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); @@ -292,8 +292,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * * (e.g. xen_load_tls()) */ - savesegment(fs, fsindex); - savesegment(gs, gsindex); + savesegment(fs, prev_fsindex); + savesegment(gs, prev_gsindex); /* * Load TLS before restoring any segments so that segment loads @@ -336,66 +336,104 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Switch FS and GS. * * These are even more complicated than DS and ES: they have - * 64-bit bases are that controlled by arch_prctl. Those bases - * only differ from the values in the GDT or LDT if the selector - * is 0. + * 64-bit bases are that controlled by arch_prctl. The bases + * don't necessarily match the selectors, as user code can do + * any number of things to cause them to be inconsistent. * - * Loading the segment register resets the hidden base part of - * the register to 0 or the value from the GDT / LDT. If the - * next base address zero, writing 0 to the segment register is - * much faster than using wrmsr to explicitly zero the base. + * We don't promise to preserve the bases if the selectors are + * nonzero. We also don't promise to preserve the base if the + * selector is zero and the base doesn't match whatever was + * most recently passed to ARCH_SET_FS/GS. 
(If/when the + * FSGSBASE instructions are enabled, we'll need to offer + * stronger guarantees.) * - * The thread_struct.fs and thread_struct.gs values are 0 - * if the fs and gs bases respectively are not overridden - * from the values implied by fsindex and gsindex. They - * are nonzero, and store the nonzero base addresses, if - * the bases are overridden. - * - * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should - * be impossible. - * - * Therefore we need to reload the segment registers if either - * the old or new selector is nonzero, and we need to override - * the base address if next thread expects it to be overridden. - * - * This code is unnecessarily slow in the case where the old and - * new indexes are zero and the new base is nonzero -- it will - * unnecessarily write 0 to the selector before writing the new - * base address. - * - * Note: This all depends on arch_prctl being the only way that - * user code can override the segment base. Once wrfsbase and - * wrgsbase are enabled, most of this code will need to change. + * As an invariant, + * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) is + * impossible. */ - if (unlikely(fsindex | next->fsindex | prev->fs)) { + if (next->fsindex) { + /* Loading a nonzero value into FS sets the index and base. */ loadsegment(fs, next->fsindex); - - /* - * If user code wrote a nonzero value to FS, then it also - * cleared the overridden base address. - * - * XXX: if user code wrote 0 to FS and cleared the base - * address itself, we won't notice and we'll incorrectly - * restore the prior base address next time we reschdule - * the process. - */ - if (fsindex) - prev->fs = 0; + } else { + if (next->fs) { + /* Next index is zero but next base is nonzero. */ + if (prev_fsindex) + loadsegment(fs, 0); + wrmsrl(MSR_FS_BASE, next->fs); + } else { + /* Next base and index are both zero. */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + /* + * We don't know the previous base and can't + * find out without RDMSR. Forcibly clear it. + */ + loadsegment(fs, __USER_DS); + loadsegment(fs, 0); + } else { + /* + * If the previous index is zero and ARCH_SET_FS + * didn't change the base, then the base is + * also zero and we don't need to do anything. + */ + if (prev->fs || prev_fsindex) + loadsegment(fs, 0); + } + } } - if (next->fs) - wrmsrl(MSR_FS_BASE, next->fs); - prev->fsindex = fsindex; + /* + * Save the old state and preserve the invariant. + * NB: if prev_fsindex == 0, then we can't reliably learn the base + * without RDMSR because Intel user code can zero it without telling + * us and AMD user code can program any 32-bit value without telling + * us. + */ + if (prev_fsindex) + prev->fs = 0; + prev->fsindex = prev_fsindex; - if (unlikely(gsindex | next->gsindex | prev->gs)) { + if (next->gsindex) { + /* Loading a nonzero value into GS sets the index and base. */ load_gs_index(next->gsindex); - - /* This works (and fails) the same way as fsindex above. */ - if (gsindex) - prev->gs = 0; + } else { + if (next->gs) { + /* Next index is zero but next base is nonzero. */ + if (prev_gsindex) + load_gs_index(0); + wrmsrl(MSR_KERNEL_GS_BASE, next->gs); + } else { + /* Next base and index are both zero. */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + /* + * We don't know the previous base and can't + * find out without RDMSR. Forcibly clear it. + * + * This contains a pointless SWAPGS pair. + * Fixing it would involve an explicit check + * for Xen or a new pvop. 
+ */ + load_gs_index(__USER_DS); + load_gs_index(0); + } else { + /* + * If the previous index is zero and ARCH_SET_GS + * didn't change the base, then the base is + * also zero and we don't need to do anything. + */ + if (prev->gs || prev_gsindex) + load_gs_index(0); + } + } } - if (next->gs) - wrmsrl(MSR_KERNEL_GS_BASE, next->gs); - prev->gsindex = gsindex; + /* + * Save the old state and preserve the invariant. + * NB: if prev_gsindex == 0, then we can't reliably learn the base + * without RDMSR because Intel user code can zero it without telling + * us and AMD user code can program any 32-bit value without telling + * us. + */ + if (prev_gsindex) + prev->gs = 0; + prev->gsindex = prev_gsindex; switch_fpu_finish(next_fpu, fpu_switch); -- GitLab From 0230bb038fa99af0c425fc4cffed307e545a9642 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 7 Apr 2016 17:31:48 -0700 Subject: [PATCH 179/705] x86/cpu: Move X86_BUG_ESPFIX initialization to generic_identify() It was in detect_nopl(), which was either a mistake by me or some kind of mis-merge. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rudolf Marek Cc: Thomas Gleixner Fixes: ff236456f072 ("x86/cpu: Move X86_BUG_ESPFIX initialization to generic_identify") Link: http://lkml.kernel.org/r/0949337f13660461edca08ab67d1a841441289c9.1460075211.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 50 ++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8e40eee5843a3..28d3255edf000 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -861,31 +861,6 @@ static void detect_nopl(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_NOPL); #else set_cpu_cap(c, X86_FEATURE_NOPL); -#endif - - /* - * ESPFIX is a strange bug. All real CPUs have it. Paravirt - * systems that run Linux at CPL > 0 may or may not have the - * issue, but, even if they have the issue, there's absolutely - * nothing we can do about it because we can't use the real IRET - * instruction. - * - * NB: For the time being, only 32-bit kernels support - * X86_BUG_ESPFIX as such. 64-bit kernels directly choose - * whether to apply espfix using paravirt hooks. If any - * non-paravirt system ever shows up that does *not* have the - * ESPFIX issue, we can change this. - */ -#ifdef CONFIG_X86_32 -#ifdef CONFIG_PARAVIRT - do { - extern void native_iret(void); - if (pv_cpu_ops.iret == native_iret) - set_cpu_bug(c, X86_BUG_ESPFIX); - } while (0); -#else - set_cpu_bug(c, X86_BUG_ESPFIX); -#endif #endif } @@ -952,6 +927,31 @@ static void generic_identify(struct cpuinfo_x86 *c) detect_nopl(c); detect_null_seg_behavior(c); + + /* + * ESPFIX is a strange bug. All real CPUs have it. Paravirt + * systems that run Linux at CPL > 0 may or may not have the + * issue, but, even if they have the issue, there's absolutely + * nothing we can do about it because we can't use the real IRET + * instruction. + * + * NB: For the time being, only 32-bit kernels support + * X86_BUG_ESPFIX as such. 64-bit kernels directly choose + * whether to apply espfix using paravirt hooks. If any + * non-paravirt system ever shows up that does *not* have the + * ESPFIX issue, we can change this. 
+ */ +#ifdef CONFIG_X86_32 +# ifdef CONFIG_PARAVIRT + do { + extern void native_iret(void); + if (pv_cpu_ops.iret == native_iret) + set_cpu_bug(c, X86_BUG_ESPFIX); + } while (0); +# else + set_cpu_bug(c, X86_BUG_ESPFIX); +# endif +#endif } static void x86_init_cache_qos(struct cpuinfo_x86 *c) -- GitLab From 96e5d28ae7a5250f3deb2434f1895c9daf48b1bd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 7 Apr 2016 17:31:49 -0700 Subject: [PATCH 180/705] x86/cpu: Add Erratum 88 detection on AMD Erratum 88 affects old AMD K8s, where a SWAPGS fails to cause an input dependency on GS. Therefore, we need to MFENCE before it. But that MFENCE is expensive and unnecessary on the remaining x86 CPUs out there so patch it out on the CPUs which don't require it. Signed-off-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rudolf Marek Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/aec6b2df1bfc56101d4e9e2e5d5d570bf41663c6.1460075211.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 2 +- arch/x86/include/asm/cpufeatures.h | 2 ++ arch/x86/kernel/cpu/amd.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 858b555e274b8..64d2033d1e49d 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -783,7 +783,7 @@ ENTRY(native_load_gs_index) SWAPGS gs_change: movl %edi, %gs -2: mfence /* workaround */ +2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE SWAPGS popfq ret diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2a052302bc439..7bfb6b70c7452 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -295,6 +295,8 @@ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ #define X86_BUG_NULL_SEG X86_BUG(9) /* Nulling a selector preserves the base */ +#define X86_BUG_SWAPGS_FENCE X86_BUG(10) /* SWAPGS without input dep on GS */ + #ifdef CONFIG_X86_32 /* diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 6e47e3a916f12..b7cc9efe08b5c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -632,6 +632,7 @@ static void init_amd_k8(struct cpuinfo_x86 *c) */ msr_set_bit(MSR_K7_HWCR, 6); #endif + set_cpu_bug(c, X86_BUG_SWAPGS_FENCE); } static void init_amd_gh(struct cpuinfo_x86 *c) -- GitLab From 42c748bb2544f21c3d115240527fe4478e193641 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 7 Apr 2016 17:31:50 -0700 Subject: [PATCH 181/705] x86/entry/64: Make gs_change a local label ... so that it doesn't appear in objdump output. Signed-off-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rudolf Marek Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b9c532a0e5f8d56dede2bd59767d40024d5a75e2.1460075211.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 64d2033d1e49d..1693c17dbf814 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -781,7 +781,7 @@ ENTRY(native_load_gs_index) pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS -gs_change: +.Lgs_change: movl %edi, %gs 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE SWAPGS @@ -789,7 +789,7 @@ gs_change: ret END(native_load_gs_index) - _ASM_EXTABLE(gs_change, bad_gs) + _ASM_EXTABLE(.Lgs_change, bad_gs) .section .fixup, "ax" /* running with kernelgs */ bad_gs: @@ -1019,13 +1019,13 @@ ENTRY(error_entry) movl %ecx, %eax /* zero extend */ cmpq %rax, RIP+8(%rsp) je .Lbstep_iret - cmpq $gs_change, RIP+8(%rsp) + cmpq $.Lgs_change, RIP+8(%rsp) jne .Lerror_entry_done /* - * hack: gs_change can fail with user gsbase. If this happens, fix up + * hack: .Lgs_change can fail with user gsbase. If this happens, fix up * gsbase and proceed. We'll fix up the exception and land in - * gs_change's error handler with kernel gsbase. + * .Lgs_change's error handler with kernel gsbase. */ jmp .Lerror_entry_from_usermode_swapgs -- GitLab From 1ed95e52d902035e39a715ff3a314a893a96e5b7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 7 Apr 2016 17:16:59 -0700 Subject: [PATCH 182/705] x86/vdso: Remove direct HPET access through the vDSO Allowing user code to map the HPET is problematic. HPET implementations are notoriously buggy, and there are probably many machines on which even MMIO reads from bogus HPET addresses are problematic. We have a report that the Dell Precision M2800 with: ACPI: HPET 0x00000000C8FE6238 000038 (v01 DELL CBX3 01072009 AMI. 00000005) either is extremely slow when accessing the HPET or actually hangs in some regard, causing soft lockups to be reported if users do unexpected things to the HPET. The vclock HPET code has also always been a questionable speedup. Accessing an HPET is exceedingly slow (on the order of several microseconds), so the added overhead in requiring a syscall to read the HPET is a small fraction of the total cost of accessing it. To avoid future problems, let's just delete the code entirely. In the long run, this could actually be a speedup. Waiman Long has a patch to optimize the case where multiple CPUs contend for the HPET, but that won't help unless all the accesses are mediated by the kernel.
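As a rough illustration of the cost argument above, the sketch below (ordinary userspace C, nothing HPET-specific, numbers vary by machine and clocksource) compares the vDSO path of clock_gettime() with the forced-syscall path. With the TSC clocksource the vDSO call is a few nanoseconds and the syscall adds most of the cost; with HPET every read is a multi-microsecond MMIO access either way, so mediating it through the kernel loses little:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <time.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #define N 1000000

    static long long ns(const struct timespec *ts)
    {
            return ts->tv_sec * 1000000000LL + ts->tv_nsec;
    }

    int main(void)
    {
            struct timespec a, b, t;
            int i;

            clock_gettime(CLOCK_MONOTONIC, &a);
            for (i = 0; i < N; i++)
                    clock_gettime(CLOCK_MONOTONIC, &t);  /* vDSO path */
            clock_gettime(CLOCK_MONOTONIC, &b);
            printf("vDSO:    %lld ns/call\n", (ns(&b) - ns(&a)) / N);

            clock_gettime(CLOCK_MONOTONIC, &a);
            for (i = 0; i < N; i++)                      /* forced syscall */
                    syscall(SYS_clock_gettime, CLOCK_MONOTONIC, &t);
            clock_gettime(CLOCK_MONOTONIC, &b);
            printf("syscall: %lld ns/call\n", (ns(&b) - ns(&a)) / N);

            return 0;
    }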
Reported-by: Rasmus Villemoes Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Acked-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Waiman Long Cc: Waiman Long Link: http://lkml.kernel.org/r/d2f90bba98db9905041cff294646d290d378f67a.1460074438.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vclock_gettime.c | 15 --------------- arch/x86/entry/vdso/vdso-layout.lds.S | 5 ++--- arch/x86/entry/vdso/vma.c | 11 ----------- arch/x86/include/asm/clocksource.h | 9 ++++----- arch/x86/kernel/hpet.c | 1 - arch/x86/kvm/trace.h | 3 +-- 6 files changed, 7 insertions(+), 37 deletions(-) diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 03c3eb77bfceb..2f02d23a05ef4 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -13,7 +13,6 @@ #include #include -#include #include #include #include @@ -28,16 +27,6 @@ extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts); extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); extern time_t __vdso_time(time_t *t); -#ifdef CONFIG_HPET_TIMER -extern u8 hpet_page - __attribute__((visibility("hidden"))); - -static notrace cycle_t vread_hpet(void) -{ - return *(const volatile u32 *)(&hpet_page + HPET_COUNTER); -} -#endif - #ifdef CONFIG_PARAVIRT_CLOCK extern u8 pvclock_page __attribute__((visibility("hidden"))); @@ -195,10 +184,6 @@ notrace static inline u64 vgetsns(int *mode) if (gtod->vclock_mode == VCLOCK_TSC) cycles = vread_tsc(); -#ifdef CONFIG_HPET_TIMER - else if (gtod->vclock_mode == VCLOCK_HPET) - cycles = vread_hpet(); -#endif #ifdef CONFIG_PARAVIRT_CLOCK else if (gtod->vclock_mode == VCLOCK_PVCLOCK) cycles = vread_pvclock(mode); diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index 4158acc17df07..a708aa90b507f 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -25,7 +25,7 @@ SECTIONS * segment. */ - vvar_start = . - 3 * PAGE_SIZE; + vvar_start = . - 2 * PAGE_SIZE; vvar_page = vvar_start; /* Place all vvars at the offsets in asm/vvar.h. */ @@ -35,8 +35,7 @@ SECTIONS #undef __VVAR_KERNEL_LDS #undef EMIT_VVAR - hpet_page = vvar_start + PAGE_SIZE; - pvclock_page = vvar_start + 2 * PAGE_SIZE; + pvclock_page = vvar_start + PAGE_SIZE; . 
= SIZEOF_HEADERS; diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 10f7045849226..b3cf81333a54e 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include @@ -129,16 +128,6 @@ static int vvar_fault(const struct vm_special_mapping *sm, if (sym_offset == image->sym_vvar_page) { ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, __pa_symbol(&__vvar_page) >> PAGE_SHIFT); - } else if (sym_offset == image->sym_hpet_page) { -#ifdef CONFIG_HPET_TIMER - if (hpet_address && vclock_was_used(VCLOCK_HPET)) { - ret = vm_insert_pfn_prot( - vma, - (unsigned long)vmf->virtual_address, - hpet_address >> PAGE_SHIFT, - pgprot_noncached(PAGE_READONLY)); - } -#endif } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = pvclock_pvti_cpu0_va(); diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index d194266acb28e..eae33c7170c86 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -3,11 +3,10 @@ #ifndef _ASM_X86_CLOCKSOURCE_H #define _ASM_X86_CLOCKSOURCE_H -#define VCLOCK_NONE 0 /* No vDSO clock available. */ -#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ -#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ -#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */ -#define VCLOCK_MAX 3 +#define VCLOCK_NONE 0 /* No vDSO clock available. */ +#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ +#define VCLOCK_PVCLOCK 2 /* vDSO should use vread_pvclock. */ +#define VCLOCK_MAX 2 struct arch_clocksource_data { int vclock_mode; diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index a1f0e4a5c47e3..7282c2e3858ec 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -773,7 +773,6 @@ static struct clocksource clocksource_hpet = { .mask = HPET_MASK, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, - .archdata = { .vclock_mode = VCLOCK_HPET }, }; static int hpet_clocksource_register(void) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 2f1ea2f61e1fc..b72743c5668d3 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -809,8 +809,7 @@ TRACE_EVENT(kvm_write_tsc_offset, #define host_clocks \ {VCLOCK_NONE, "none"}, \ - {VCLOCK_TSC, "tsc"}, \ - {VCLOCK_HPET, "hpet"} \ + {VCLOCK_TSC, "tsc"} \ TRACE_EVENT(kvm_update_master_clock, TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched), -- GitLab From 482dd2ef124484601adea82e5e806e81e2bc5521 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Apr 2016 22:43:59 +0200 Subject: [PATCH 183/705] x86/syscalls: Wire up compat readv2/writev2 syscalls Reported-by: David Smith Tested-by: David Smith Signed-off-by: Christoph Hellwig Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160407204359.GA3720@lst.de Signed-off-by: Ingo Molnar --- arch/x86/entry/syscalls/syscall_32.tbl | 4 ++-- arch/x86/entry/syscalls/syscall_64.tbl | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index b30dd8154cc24..4cddd17153fbe 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -384,5 +384,5 @@ 375 i386 membarrier sys_membarrier 376 i386 mlock2 sys_mlock2 377 i386 copy_file_range sys_copy_file_range -378 i386 preadv2 sys_preadv2 -379 i386 pwritev2 sys_pwritev2 +378 i386 preadv2 sys_preadv2 compat_sys_preadv2 +379 i386 pwritev2 sys_pwritev2 compat_sys_pwritev2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index cac6d17ce5db0..555263e385c92 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -374,3 +374,5 @@ 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit 545 x32 execveat compat_sys_execveat/ptregs +534 x32 preadv2 compat_sys_preadv2 +535 x32 pwritev2 compat_sys_pwritev2 -- GitLab From f8e04d854506ddfeba9cb41b601972b28521f104 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 7 Apr 2016 17:12:21 +0200 Subject: [PATCH 184/705] locking/rwsem: Get rid of __down_write_nested() This is no longer used anywhere and all callers (__down_write()) use 0 as a subclass. Ditch __down_write_nested() to make the code easier to follow. This shouldn't introduce any functional change. Signed-off-by: Michal Hocko Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Chris Zankel Cc: David S. Miller Cc: Linus Torvalds Cc: Max Filippov Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Signed-off-by: Davidlohr Bueso Cc: Signed-off-by: Jason Low Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-alpha@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: linux-xtensa@linux-xtensa.org Cc: sparclinux@vger.kernel.org Link: http://lkml.kernel.org/r/1460041951-22347-2-git-send-email-mhocko@kernel.org Signed-off-by: Ingo Molnar --- arch/s390/include/asm/rwsem.h | 7 +------ arch/sh/include/asm/rwsem.h | 5 ----- arch/sparc/include/asm/rwsem.h | 7 +------ arch/x86/include/asm/rwsem.h | 7 +------ include/asm-generic/rwsem.h | 7 +------ include/linux/rwsem-spinlock.h | 1 - kernel/locking/rwsem-spinlock.c | 7 +------ 7 files changed, 5 insertions(+), 36 deletions(-) diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h index fead491dfc285..555d23b6b6d13 100644 --- a/arch/s390/include/asm/rwsem.h +++ b/arch/s390/include/asm/rwsem.h @@ -90,7 +90,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void __down_write(struct rw_semaphore *sem) { signed long old, new, tmp; @@ -108,11 +108,6 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_down_write_failed(sem); } -static inline void __down_write(struct rw_semaphore *sem) -{ - __down_write_nested(sem, 0); -} - /* * trylock for writing -- returns 1 if successful, 0 if contention */ diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h index edab572652939..a5104bebd1eb3 100644 --- a/arch/sh/include/asm/rwsem.h +++ b/arch/sh/include/asm/rwsem.h @@ -114,11 +114,6 @@ static inline void __downgrade_write(struct rw_semaphore *sem) rwsem_downgrade_wake(sem); } -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) -{ - __down_write(sem); -} - /* * implement exchange and add functionality */ diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h index 069bf4d663a11..e5a0d575bc7f9 100644 --- a/arch/sparc/include/asm/rwsem.h +++ b/arch/sparc/include/asm/rwsem.h @@ -45,7 +45,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void __down_write(struct rw_semaphore *sem) { long tmp; @@ -55,11 +55,6 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_down_write_failed(sem); } -static inline void __down_write(struct rw_semaphore *sem) -{ - __down_write_nested(sem, 0); -} - static inline int __down_write_trylock(struct rw_semaphore *sem) { long tmp; diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index ceec86eb68e96..4a8292a0d6e16 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -99,7 +99,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void __down_write(struct rw_semaphore *sem) { long tmp; asm volatile("# beginning down_write\n\t" @@ -116,11 +116,6 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) : "memory", "cc"); } -static inline void __down_write(struct rw_semaphore *sem) -{ - __down_write_nested(sem, 0); -} - /* * trylock for writing -- returns 1 if successful, 0 if contention */ diff --git 
a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h index d6d5dc98d7da5..b8d8a6cf4ca82 100644 --- a/include/asm-generic/rwsem.h +++ b/include/asm-generic/rwsem.h @@ -53,7 +53,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void __down_write(struct rw_semaphore *sem) { long tmp; @@ -63,11 +63,6 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_down_write_failed(sem); } -static inline void __down_write(struct rw_semaphore *sem) -{ - __down_write_nested(sem, 0); -} - static inline int __down_write_trylock(struct rw_semaphore *sem) { long tmp; diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index 561e8615528d4..a733a5467e6c1 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -34,7 +34,6 @@ struct rw_semaphore { extern void __down_read(struct rw_semaphore *sem); extern int __down_read_trylock(struct rw_semaphore *sem); extern void __down_write(struct rw_semaphore *sem); -extern void __down_write_nested(struct rw_semaphore *sem, int subclass); extern int __down_write_trylock(struct rw_semaphore *sem); extern void __up_read(struct rw_semaphore *sem); extern void __up_write(struct rw_semaphore *sem); diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 3a50485720653..bab26104a5d0e 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -191,7 +191,7 @@ int __down_read_trylock(struct rw_semaphore *sem) /* * get a write lock on the semaphore */ -void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +void __sched __down_write(struct rw_semaphore *sem) { struct rwsem_waiter waiter; struct task_struct *tsk; @@ -227,11 +227,6 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } -void __sched __down_write(struct rw_semaphore *sem) -{ - __down_write_nested(sem, 0); -} - /* * trylock for writing -- returns 1 if successful, 0 if contention */ -- GitLab From 2e927c6422fea5ce36b24b00c2c84f2e9ead31b6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 7 Apr 2016 17:12:22 +0200 Subject: [PATCH 185/705] locking/rwsem: Drop explicit memory barriers sh and xtensa seem to be the only architectures which use explicit memory barriers for rw_semaphore operations even though they are not really needed because there is the full memory barrier is always implied by atomic_{inc,dec,add,sub}_return() resp. cmpxchg(). Remove them. Signed-off-by: Michal Hocko Cc: Andrew Morton Cc: Chris Zankel Cc: David S. Miller Cc: Linus Torvalds Cc: Max Filippov Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Signed-off-by: Davidlohr Bueso Cc: Signed-off-by: Jason Low Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-alpha@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: linux-xtensa@linux-xtensa.org Cc: sparclinux@vger.kernel.org Link: http://lkml.kernel.org/r/1460041951-22347-3-git-send-email-mhocko@kernel.org Signed-off-by: Ingo Molnar --- arch/sh/include/asm/rwsem.h | 14 ++------------ arch/xtensa/include/asm/rwsem.h | 14 ++------------ 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h index a5104bebd1eb3..f6c951c7a875b 100644 --- a/arch/sh/include/asm/rwsem.h +++ b/arch/sh/include/asm/rwsem.h @@ -24,9 +24,7 @@ */ static inline void __down_read(struct rw_semaphore *sem) { - if (atomic_inc_return((atomic_t *)(&sem->count)) > 0) - smp_wmb(); - else + if (atomic_inc_return((atomic_t *)(&sem->count)) <= 0) rwsem_down_read_failed(sem); } @@ -37,7 +35,6 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) while ((tmp = sem->count) >= 0) { if (tmp == cmpxchg(&sem->count, tmp, tmp + RWSEM_ACTIVE_READ_BIAS)) { - smp_wmb(); return 1; } } @@ -53,9 +50,7 @@ static inline void __down_write(struct rw_semaphore *sem) tmp = atomic_add_return(RWSEM_ACTIVE_WRITE_BIAS, (atomic_t *)(&sem->count)); - if (tmp == RWSEM_ACTIVE_WRITE_BIAS) - smp_wmb(); - else + if (tmp != RWSEM_ACTIVE_WRITE_BIAS) rwsem_down_write_failed(sem); } @@ -65,7 +60,6 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); - smp_wmb(); return tmp == RWSEM_UNLOCKED_VALUE; } @@ -76,7 +70,6 @@ static inline void __up_read(struct rw_semaphore *sem) { int tmp; - smp_wmb(); tmp = atomic_dec_return((atomic_t *)(&sem->count)); if (tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0) rwsem_wake(sem); @@ -87,7 +80,6 @@ static inline void __up_read(struct rw_semaphore *sem) */ static inline void __up_write(struct rw_semaphore *sem) { - smp_wmb(); if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, (atomic_t *)(&sem->count)) < 0) rwsem_wake(sem); @@ -108,7 +100,6 @@ static inline void __downgrade_write(struct rw_semaphore *sem) { int tmp; - smp_wmb(); tmp = atomic_add_return(-RWSEM_WAITING_BIAS, (atomic_t *)(&sem->count)); if (tmp < 0) rwsem_downgrade_wake(sem); @@ -119,7 +110,6 @@ static inline void __downgrade_write(struct rw_semaphore *sem) */ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) { - smp_mb(); return atomic_add_return(delta, (atomic_t *)(&sem->count)); } diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h index 249619e7e7f2a..593483f6e1ff5 100644 --- a/arch/xtensa/include/asm/rwsem.h +++ b/arch/xtensa/include/asm/rwsem.h @@ -29,9 +29,7 @@ */ static inline void __down_read(struct rw_semaphore *sem) { - if (atomic_add_return(1,(atomic_t *)(&sem->count)) > 0) - smp_wmb(); - else + if (atomic_add_return(1,(atomic_t *)(&sem->count)) <= 0) rwsem_down_read_failed(sem); } @@ -42,7 +40,6 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) while ((tmp = sem->count) >= 0) { if (tmp == cmpxchg(&sem->count, tmp, tmp + RWSEM_ACTIVE_READ_BIAS)) { - smp_wmb(); return 1; } } @@ -58,9 +55,7 @@ static inline void __down_write(struct rw_semaphore *sem) tmp = atomic_add_return(RWSEM_ACTIVE_WRITE_BIAS, (atomic_t *)(&sem->count)); - if (tmp == RWSEM_ACTIVE_WRITE_BIAS) - smp_wmb(); - else + if (tmp != 
RWSEM_ACTIVE_WRITE_BIAS) rwsem_down_write_failed(sem); } @@ -70,7 +65,6 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); - smp_wmb(); return tmp == RWSEM_UNLOCKED_VALUE; } @@ -81,7 +75,6 @@ static inline void __up_read(struct rw_semaphore *sem) { int tmp; - smp_wmb(); tmp = atomic_sub_return(1,(atomic_t *)(&sem->count)); if (tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0) rwsem_wake(sem); @@ -92,7 +85,6 @@ static inline void __up_read(struct rw_semaphore *sem) */ static inline void __up_write(struct rw_semaphore *sem) { - smp_wmb(); if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, (atomic_t *)(&sem->count)) < 0) rwsem_wake(sem); @@ -113,7 +105,6 @@ static inline void __downgrade_write(struct rw_semaphore *sem) { int tmp; - smp_wmb(); tmp = atomic_add_return(-RWSEM_WAITING_BIAS, (atomic_t *)(&sem->count)); if (tmp < 0) rwsem_downgrade_wake(sem); @@ -124,7 +115,6 @@ static inline void __downgrade_write(struct rw_semaphore *sem) */ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) { - smp_mb(); return atomic_add_return(delta, (atomic_t *)(&sem->count)); } -- GitLab From 3aa2591dc2ea292d068dc3a263f1976806c2c281 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 7 Apr 2016 17:12:23 +0200 Subject: [PATCH 186/705] locking/rwsem, xtensa: Drop superfluous arch specific implementation Since "locking, rwsem: drop explicit memory barriers" the arch specific code is basically same as the the generic one so we can drop the superfluous code. Suggested-by: Davidlohr Bueso Signed-off-by: Michal Hocko Acked-by: Max Filippov Cc: Andrew Morton Cc: Chris Zankel Cc: David S. Miller Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Signed-off-by: Davidlohr Bueso Cc: Signed-off-by: Jason Low Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-alpha@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: linux-xtensa@linux-xtensa.org Cc: sparclinux@vger.kernel.org Link: http://lkml.kernel.org/r/1460041951-22347-4-git-send-email-mhocko@kernel.org Signed-off-by: Ingo Molnar --- arch/xtensa/include/asm/Kbuild | 1 + arch/xtensa/include/asm/rwsem.h | 121 -------------------------------- 2 files changed, 1 insertion(+), 121 deletions(-) delete mode 100644 arch/xtensa/include/asm/rwsem.h diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index b56855a1382a3..28cf4c5d65efa 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -22,6 +22,7 @@ generic-y += mm-arch-hooks.h generic-y += percpu.h generic-y += preempt.h generic-y += resource.h +generic-y += rwsem.h generic-y += sections.h generic-y += siginfo.h generic-y += statfs.h diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h deleted file mode 100644 index 593483f6e1ff5..0000000000000 --- a/arch/xtensa/include/asm/rwsem.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * include/asm-xtensa/rwsem.h - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Largely copied from include/asm-ppc/rwsem.h - * - * Copyright (C) 2001 - 2005 Tensilica Inc. - */ - -#ifndef _XTENSA_RWSEM_H -#define _XTENSA_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "Please don't include directly, use instead." 
-#endif - -#define RWSEM_UNLOCKED_VALUE 0x00000000 -#define RWSEM_ACTIVE_BIAS 0x00000001 -#define RWSEM_ACTIVE_MASK 0x0000ffff -#define RWSEM_WAITING_BIAS (-0x00010000) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (atomic_add_return(1,(atomic_t *)(&sem->count)) <= 0) - rwsem_down_read_failed(sem); -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - int tmp; - - while ((tmp = sem->count) >= 0) { - if (tmp == cmpxchg(&sem->count, tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { - return 1; - } - } - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write(struct rw_semaphore *sem) -{ - int tmp; - - tmp = atomic_add_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)); - if (tmp != RWSEM_ACTIVE_WRITE_BIAS) - rwsem_down_write_failed(sem); -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - int tmp; - - tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - int tmp; - - tmp = atomic_sub_return(1,(atomic_t *)(&sem->count)); - if (tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)) < 0) - rwsem_wake(sem); -} - -/* - * implement atomic add functionality - */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) -{ - atomic_add(delta, (atomic_t *)(&sem->count)); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - int tmp; - - tmp = atomic_add_return(-RWSEM_WAITING_BIAS, (atomic_t *)(&sem->count)); - if (tmp < 0) - rwsem_downgrade_wake(sem); -} - -/* - * implement exchange and add functionality - */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) -{ - return atomic_add_return(delta, (atomic_t *)(&sem->count)); -} - -#endif /* _XTENSA_RWSEM_H */ -- GitLab From e4a2b01ed3d1591437f93a42f6c4c039b60e0c0a Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 7 Apr 2016 17:12:24 +0200 Subject: [PATCH 187/705] locking/rwsem, sh: Drop superfluous arch specific implementation Since "locking, rwsem: drop explicit memory barriers" the arch specific code is basically same as the the generic one so we can drop the superfluous code. Suggested-by: Davidlohr Bueso Signed-off-by: Michal Hocko Cc: Andrew Morton Cc: Chris Zankel Cc: David S. Miller Cc: Linus Torvalds Cc: Max Filippov Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Signed-off-by: Davidlohr Bueso Cc: Signed-off-by: Jason Low Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-alpha@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: linux-xtensa@linux-xtensa.org Cc: sparclinux@vger.kernel.org Link: http://lkml.kernel.org/r/1460041951-22347-5-git-send-email-mhocko@kernel.org Signed-off-by: Ingo Molnar --- arch/sh/include/asm/Kbuild | 1 + arch/sh/include/asm/rwsem.h | 117 ------------------------------------ 2 files changed, 1 insertion(+), 117 deletions(-) delete mode 100644 arch/sh/include/asm/rwsem.h diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index a319745a7b635..751c3373a92c8 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -26,6 +26,7 @@ generic-y += percpu.h generic-y += poll.h generic-y += preempt.h generic-y += resource.h +generic-y += rwsem.h generic-y += sembuf.h generic-y += serial.h generic-y += shmbuf.h diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h deleted file mode 100644 index f6c951c7a875b..0000000000000 --- a/arch/sh/include/asm/rwsem.h +++ /dev/null @@ -1,117 +0,0 @@ -/* - * include/asm-sh/rwsem.h: R/W semaphores for SH using the stuff - * in lib/rwsem.c. - */ - -#ifndef _ASM_SH_RWSEM_H -#define _ASM_SH_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" -#endif - -#ifdef __KERNEL__ - -#define RWSEM_UNLOCKED_VALUE 0x00000000 -#define RWSEM_ACTIVE_BIAS 0x00000001 -#define RWSEM_ACTIVE_MASK 0x0000ffff -#define RWSEM_WAITING_BIAS (-0x00010000) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (atomic_inc_return((atomic_t *)(&sem->count)) <= 0) - rwsem_down_read_failed(sem); -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - int tmp; - - while ((tmp = sem->count) >= 0) { - if (tmp == cmpxchg(&sem->count, tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { - return 1; - } - } - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write(struct rw_semaphore *sem) -{ - int tmp; - - tmp = atomic_add_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)); - if (tmp != RWSEM_ACTIVE_WRITE_BIAS) - rwsem_down_write_failed(sem); -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - int tmp; - - tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - int tmp; - - tmp = atomic_dec_return((atomic_t *)(&sem->count)); - if (tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)) < 0) - rwsem_wake(sem); -} - -/* - * implement atomic add functionality - */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) -{ - atomic_add(delta, (atomic_t *)(&sem->count)); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - int tmp; - - tmp = atomic_add_return(-RWSEM_WAITING_BIAS, (atomic_t *)(&sem->count)); - if (tmp < 0) - rwsem_downgrade_wake(sem); -} - -/* - * 
implement exchange and add functionality - */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) -{ - return atomic_add_return(delta, (atomic_t *)(&sem->count)); -} - -#endif /* __KERNEL__ */ -#endif /* _ASM_SH_RWSEM_H */ -- GitLab From 938072e32ce13e5537ef001cdabcb8a6932b09a0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 7 Apr 2016 17:12:25 +0200 Subject: [PATCH 188/705] locking/rwsem, sparc: Drop superfluous arch specific implementation sparc basically reuses the generic implementation of rwsem so we can reuse the code rather than duplicate it. Signed-off-by: Michal Hocko Cc: Andrew Morton Cc: Chris Zankel Cc: David S. Miller Cc: Linus Torvalds Cc: Max Filippov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Signed-off-by: Davidlohr Bueso Cc: Signed-off-by: Jason Low Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-alpha@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: linux-xtensa@linux-xtensa.org Cc: sparclinux@vger.kernel.org Link: http://lkml.kernel.org/r/1460041951-22347-6-git-send-email-mhocko@kernel.org Signed-off-by: Ingo Molnar --- arch/sparc/include/asm/Kbuild | 1 + arch/sparc/include/asm/rwsem.h | 119 --------------------------------- 2 files changed, 1 insertion(+), 119 deletions(-) delete mode 100644 arch/sparc/include/asm/rwsem.h diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index e928618838bc5..6024c26c05856 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -16,6 +16,7 @@ generic-y += mm-arch-hooks.h generic-y += module.h generic-y += mutex.h generic-y += preempt.h +generic-y += rwsem.h generic-y += serial.h generic-y += trace_clock.h generic-y += types.h diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h deleted file mode 100644 index e5a0d575bc7f9..0000000000000 --- a/arch/sparc/include/asm/rwsem.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * rwsem.h: R/W semaphores implemented using CAS - * - * Written by David S. Miller (davem@redhat.com), 2001. 
- * Derived from asm-i386/rwsem.h - */ -#ifndef _SPARC64_RWSEM_H -#define _SPARC64_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" -#endif - -#ifdef __KERNEL__ - -#define RWSEM_UNLOCKED_VALUE 0x00000000L -#define RWSEM_ACTIVE_BIAS 0x00000001L -#define RWSEM_ACTIVE_MASK 0xffffffffL -#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (unlikely(atomic64_inc_return((atomic64_t *)(&sem->count)) <= 0L)) - rwsem_down_read_failed(sem); -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - long tmp; - - while ((tmp = sem->count) >= 0L) { - if (tmp == cmpxchg(&sem->count, tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { - return 1; - } - } - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write(struct rw_semaphore *sem) -{ - long tmp; - - tmp = atomic64_add_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic64_t *)(&sem->count)); - if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) - rwsem_down_write_failed(sem); -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - long tmp; - - tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - long tmp; - - tmp = atomic64_dec_return((atomic64_t *)(&sem->count)); - if (unlikely(tmp < -1L && (tmp & RWSEM_ACTIVE_MASK) == 0L)) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - if (unlikely(atomic64_sub_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic64_t *)(&sem->count)) < 0L)) - rwsem_wake(sem); -} - -/* - * implement atomic add functionality - */ -static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem) -{ - atomic64_add(delta, (atomic64_t *)(&sem->count)); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - long tmp; - - tmp = atomic64_add_return(-RWSEM_WAITING_BIAS, (atomic64_t *)(&sem->count)); - if (tmp < 0L) - rwsem_downgrade_wake(sem); -} - -/* - * implement exchange and add functionality - */ -static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) -{ - return atomic64_add_return(delta, (atomic64_t *)(&sem->count)); -} - -#endif /* __KERNEL__ */ - -#endif /* _SPARC64_RWSEM_H */ -- GitLab From d47996082f52baa0ca8b48d26b3cbef5ede70a73 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 7 Apr 2016 17:12:26 +0200 Subject: [PATCH 189/705] locking/rwsem: Introduce basis for down_write_killable() Introduce a generic implementation necessary for down_write_killable(). This is a trivial extension of the already existing down_write() call which can be interrupted by SIGKILL. This patch doesn't provide down_write_killable() yet because arches have to provide the necessary pieces before. rwsem_down_write_failed() which is a generic slow path for the write lock is extended to take a task state and renamed to __rwsem_down_write_failed_common(). The return value is either a valid semaphore pointer or ERR_PTR(-EINTR). rwsem_down_write_failed_killable() is exported as a new way to wait for the lock and be killable. 
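To make the intended use concrete before the implementation details below: once an architecture provides __down_write_killable(), the caller-facing wrapper built on top of it lets a writer sleep for the lock yet still be killed while waiting. A minimal sketch of the calling pattern (the down_write_killable() wrapper itself only lands later in the series, and frob_mapping() is a made-up example caller):

static int frob_mapping(struct mm_struct *mm)
{
	/*
	 * Sleeps waiting for the write lock; returns -EINTR instead of
	 * blocking forever if a fatal signal arrives in the meantime.
	 */
	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;

	/* ... modify the address space under the write lock ... */

	up_write(&mm->mmap_sem);
	return 0;
}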
For the rwsem-spinlock implementation, the current __down_write() is updated in a similar way to __rwsem_down_write_failed_common(), except it doesn't need new exports, just a visible __down_write_killable(). Architectures which are not using the generic rwsem implementation are supposed to provide their __down_write_killable() implementation and use rwsem_down_write_failed_killable() for the slow path. Signed-off-by: Michal Hocko Cc: Andrew Morton Cc: Chris Zankel Cc: David S. Miller Cc: Linus Torvalds Cc: Max Filippov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Signed-off-by: Davidlohr Bueso Cc: Signed-off-by: Jason Low Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-alpha@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: linux-xtensa@linux-xtensa.org Cc: sparclinux@vger.kernel.org Link: http://lkml.kernel.org/r/1460041951-22347-7-git-send-email-mhocko@kernel.org Signed-off-by: Ingo Molnar --- include/asm-generic/rwsem.h | 12 ++++++++++++ include/linux/rwsem-spinlock.h | 1 + include/linux/rwsem.h | 2 ++ kernel/locking/rwsem-spinlock.c | 22 ++++++++++++++++++++-- kernel/locking/rwsem-xadd.c | 31 +++++++++++++++++++++++++------ 5 files changed, 60 insertions(+), 8 deletions(-) diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h index b8d8a6cf4ca82..3fc94a046bf58 100644 --- a/include/asm-generic/rwsem.h +++ b/include/asm-generic/rwsem.h @@ -63,6 +63,18 @@ static inline void __down_write(struct rw_semaphore *sem) rwsem_down_write_failed(sem); } +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + long tmp; + + tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, + (atomic_long_t *)&sem->count); + if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + return 0; +} + static inline int __down_write_trylock(struct rw_semaphore *sem) { long tmp; diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index a733a5467e6c1..ae0528b834cd3 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -34,6 +34,7 @@ struct rw_semaphore { extern void __down_read(struct rw_semaphore *sem); extern int __down_read_trylock(struct rw_semaphore *sem); extern void __down_write(struct rw_semaphore *sem); +extern int __must_check __down_write_killable(struct rw_semaphore *sem); extern int __down_write_trylock(struct rw_semaphore *sem); extern void __up_read(struct rw_semaphore *sem); extern void __up_write(struct rw_semaphore *sem); diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 8f498cdde2802..7d7ae029dac5a 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -14,6 +14,7 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/atomic.h> +#include <linux/err.h> #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #include <linux/osq_lock.h> #endif @@ -43,6 +44,7 @@ struct rw_semaphore { extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index bab26104a5d0e..1591f6b3539fd 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -191,11 +191,12 @@ int __down_read_trylock(struct rw_semaphore *sem) /* * get a
write lock on the semaphore */ -void __sched __down_write(struct rw_semaphore *sem) +int __sched __down_write_common(struct rw_semaphore *sem, int state) { struct rwsem_waiter waiter; struct task_struct *tsk; unsigned long flags; + int ret = 0; raw_spin_lock_irqsave(&sem->wait_lock, flags); @@ -215,16 +216,33 @@ void __sched __down_write(struct rw_semaphore *sem) */ if (sem->count == 0) break; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (signal_pending_state(state, current)) { + ret = -EINTR; + goto out; + } + set_task_state(tsk, state); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); raw_spin_lock_irqsave(&sem->wait_lock, flags); } /* got the lock */ sem->count = -1; +out: list_del(&waiter.list); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return ret; +} + +void __sched __down_write(struct rw_semaphore *sem) +{ + __down_write_common(sem, TASK_UNINTERRUPTIBLE); +} + +int __sched __down_write_killable(struct rw_semaphore *sem) +{ + return __down_write_common(sem, TASK_KILLABLE); } /* diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index a4d4de05b2d16..df4dcb883b505 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -433,12 +433,13 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem) /* * Wait until we successfully acquire the write lock */ -__visible -struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) +static inline struct rw_semaphore * +__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) { long count; bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; + struct rw_semaphore *ret = sem; /* undo write bias from down_write operation, stop active locking */ count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); @@ -478,7 +479,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); /* wait until we successfully acquire the lock */ - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(state); while (true) { if (rwsem_try_write_lock(count, sem)) break; @@ -486,21 +487,39 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* Block until there are no active lockers. 
*/ do { + if (signal_pending_state(state, current)) { + raw_spin_lock_irq(&sem->wait_lock); + ret = ERR_PTR(-EINTR); + goto out; + } schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(state); } while ((count = sem->count) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } +out: __set_current_state(TASK_RUNNING); - list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); - return sem; + return ret; +} + +__visible struct rw_semaphore * __sched +rwsem_down_write_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(rwsem_down_write_failed); +__visible struct rw_semaphore * __sched +rwsem_down_write_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_write_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_write_failed_killable); + /* * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here -- GitLab From 7deb5eebc1e61b15c5c7f1ef19f216b20d7f7d00 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 7 Apr 2016 17:12:27 +0200 Subject: [PATCH 190/705] locking/rwsem, alpha: Provide __down_write_killable() Introduce ___down_write() for the fast path and reuse it for __down_write() resp. __down_write_killable() each using the respective generic slow path (rwsem_down_write_failed() resp. rwsem_down_write_failed_killable()). Signed-off-by: Michal Hocko Cc: Andrew Morton Cc: Chris Zankel Cc: David S. Miller Cc: Linus Torvalds Cc: Max Filippov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Signed-off-by: Davidlohr Bueso Cc: Signed-off-by: Jason Low Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-alpha@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: linux-xtensa@linux-xtensa.org Cc: sparclinux@vger.kernel.org Link: http://lkml.kernel.org/r/1460041951-22347-8-git-send-email-mhocko@kernel.org Signed-off-by: Ingo Molnar --- arch/alpha/include/asm/rwsem.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h index a83bbea62c674..0131a7058778e 100644 --- a/arch/alpha/include/asm/rwsem.h +++ b/arch/alpha/include/asm/rwsem.h @@ -63,7 +63,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) return res >= 0 ? 1 : 0; } -static inline void __down_write(struct rw_semaphore *sem) +static inline long ___down_write(struct rw_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP @@ -83,10 +83,24 @@ static inline void __down_write(struct rw_semaphore *sem) :"=&r" (oldcount), "=m" (sem->count), "=&r" (temp) :"Ir" (RWSEM_ACTIVE_WRITE_BIAS), "m" (sem->count) : "memory"); #endif - if (unlikely(oldcount)) + return oldcount; +} + +static inline void __down_write(struct rw_semaphore *sem) +{ + if (unlikely(___down_write(sem))) rwsem_down_write_failed(sem); } +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + if (unlikely(___down_write(sem))) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + + return 0; +} + /* * trylock for writing -- returns 1 if successful, 0 if contention */ -- GitLab From a02137eb5177e7afc8dfa52a2888c1f2f4840739 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 7 Apr 2016 17:12:28 +0200 Subject: [PATCH 191/705] locking/rwsem, ia64: Provide __down_write_killable() Introduce ___down_write() for the fast path and reuse it for __down_write() resp. 
__down_write_killable() each using the respective generic slow path (rwsem_down_write_failed() resp. rwsem_down_write_failed_killable()). Signed-off-by: Michal Hocko Cc: Andrew Morton Cc: Chris Zankel Cc: David S. Miller Cc: Linus Torvalds Cc: Max Filippov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Signed-off-by: Davidlohr Bueso Cc: Signed-off-by: Jason Low Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-alpha@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: linux-xtensa@linux-xtensa.org Cc: sparclinux@vger.kernel.org Link: http://lkml.kernel.org/r/1460041951-22347-9-git-send-email-mhocko@kernel.org Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/rwsem.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h index ce112472bdd65..8b23e070b8440 100644 --- a/arch/ia64/include/asm/rwsem.h +++ b/arch/ia64/include/asm/rwsem.h @@ -49,8 +49,8 @@ __down_read (struct rw_semaphore *sem) /* * lock for writing */ -static inline void -__down_write (struct rw_semaphore *sem) +static inline long +___down_write (struct rw_semaphore *sem) { long old, new; @@ -59,10 +59,26 @@ __down_write (struct rw_semaphore *sem) new = old + RWSEM_ACTIVE_WRITE_BIAS; } while (cmpxchg_acq(&sem->count, old, new) != old); - if (old != 0) + return old; +} + +static inline void +__down_write (struct rw_semaphore *sem) +{ + if (___down_write(sem)) rwsem_down_write_failed(sem); } +static inline int +__down_write_killable (struct rw_semaphore *sem) +{ + if (___down_write(sem)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + + return 0; +} + /* * unlock after reading */ -- GitLab From 4edab14ec66fae5b3c7c4969295facf70365f39d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 7 Apr 2016 17:12:29 +0200 Subject: [PATCH 192/705] locking/rwsem, s390: Provide __down_write_killable() Introduce ___down_write() for the fast path and reuse it for __down_write() resp. __down_write_killable() each using the respective generic slow path (rwsem_down_write_failed() resp. rwsem_down_write_failed_killable()). Signed-off-by: Michal Hocko Cc: Andrew Morton Cc: Chris Zankel Cc: David S. Miller Cc: Linus Torvalds Cc: Max Filippov Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Signed-off-by: Davidlohr Bueso Cc: Signed-off-by: Jason Low Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-alpha@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: linux-xtensa@linux-xtensa.org Cc: sparclinux@vger.kernel.org Link: http://lkml.kernel.org/r/1460041951-22347-10-git-send-email-mhocko@kernel.org Signed-off-by: Ingo Molnar --- arch/s390/include/asm/rwsem.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h index 555d23b6b6d13..c75e4471e6188 100644 --- a/arch/s390/include/asm/rwsem.h +++ b/arch/s390/include/asm/rwsem.h @@ -90,7 +90,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write(struct rw_semaphore *sem) +static inline long ___down_write(struct rw_semaphore *sem) { signed long old, new, tmp; @@ -104,10 +104,25 @@ static inline void __down_write(struct rw_semaphore *sem) : "=&d" (old), "=&d" (new), "=Q" (sem->count) : "Q" (sem->count), "m" (tmp) : "cc", "memory"); - if (old != 0) + + return old; +} + +static inline void __down_write(struct rw_semaphore *sem) +{ + if (___down_write(sem)) rwsem_down_write_failed(sem); } +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + if (___down_write(sem)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + + return 0; +} + /* * trylock for writing -- returns 1 if successful, 0 if contention */ -- GitLab From 664b4e24c6145830885e854195376351b0eb3eee Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 7 Apr 2016 17:12:30 +0200 Subject: [PATCH 193/705] locking/rwsem, x86: Provide __down_write_killable() which uses the same fast path as __down_write() except it falls back to call_rwsem_down_write_failed_killable() slow path and return -EINTR if killed. To prevent from code duplication extract the skeleton of __down_write() into a helper macro which just takes the semaphore and the slow path function to be called. Signed-off-by: Michal Hocko Cc: Andrew Morton Cc: Chris Zankel Cc: David S. Miller Cc: Linus Torvalds Cc: Max Filippov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Signed-off-by: Davidlohr Bueso Cc: Signed-off-by: Jason Low Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-alpha@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: linux-xtensa@linux-xtensa.org Cc: sparclinux@vger.kernel.org Link: http://lkml.kernel.org/r/1460041951-22347-11-git-send-email-mhocko@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/rwsem.h | 41 ++++++++++++++++++++++++------------ arch/x86/lib/rwsem.S | 8 +++++++ 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 4a8292a0d6e16..d759c5f70f497 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -99,21 +99,36 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ +#define ____down_write(sem, slow_path) \ +({ \ + long tmp; \ + struct rw_semaphore* ret = sem; \ + asm volatile("# beginning down_write\n\t" \ + LOCK_PREFIX " xadd %1,(%2)\n\t" \ + /* adds 0xffff0001, returns the old value */ \ + " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" \ + /* was the active mask 0 before? 
*/\ + " jz 1f\n" \ + " call " slow_path "\n" \ + "1:\n" \ + "# ending down_write" \ + : "+m" (sem->count), "=d" (tmp), "+a" (ret) \ + : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \ + : "memory", "cc"); \ + ret; \ +}) + static inline void __down_write(struct rw_semaphore *sem) { - long tmp; - asm volatile("# beginning down_write\n\t" - LOCK_PREFIX " xadd %1,(%2)\n\t" - /* adds 0xffff0001, returns the old value */ - " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" - /* was the active mask 0 before? */ - " jz 1f\n" - " call call_rwsem_down_write_failed\n" - "1:\n" - "# ending down_write" - : "+m" (sem->count), "=d" (tmp) - : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) - : "memory", "cc"); + ____down_write(sem, "call_rwsem_down_write_failed"); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + if (IS_ERR(____down_write(sem, "call_rwsem_down_write_failed_killable"))) + return -EINTR; + + return 0; } /* diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index be110efa00966..4534a7e912f31 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S @@ -106,6 +106,14 @@ ENTRY(call_rwsem_down_write_failed) ret ENDPROC(call_rwsem_down_write_failed) +ENTRY(call_rwsem_down_write_failed_killable) + save_common_regs + movq %rax,%rdi + call rwsem_down_write_failed_killable + restore_common_regs + ret +ENDPROC(call_rwsem_down_write_failed_killable) + ENTRY(call_rwsem_wake) FRAME_BEGIN /* do nothing if still outstanding active readers */ -- GitLab From e465de1cd5e1759e40f077bac287de60d56ad06c Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 6 Apr 2016 17:35:07 +0300 Subject: [PATCH 194/705] perf/x86/intel/pt: Use boot_cpu_has() because it's there At the moment, initialization path is using test_cpu_cap(&boot_cpu_data), to detect PT, which is just open coding boot_cpu_has(). Use the latter instead. Signed-off-by: Alexander Shishkin Acked-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Cc: vince@deater.net Link: http://lkml.kernel.org/r/1459953307-14372-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/pt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 127f58c179767..1aefd430e7522 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1106,7 +1106,7 @@ static __init int pt_init(void) BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE); - if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) + if (!boot_cpu_has(X86_FEATURE_INTEL_PT)) return -ENODEV; get_online_cpus(); -- GitLab From 69385f8879344f4a1f078f761bd3523fcf697131 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 6 Apr 2016 10:05:14 +0200 Subject: [PATCH 195/705] x86/RAS: Rename AMD MCE injector config item ... to be the same like the file name of injection module itself to avoid confusion when grepping. No functionality change. 
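A note on the x86 ____down_write() helper macro introduced two patches above: the idea is a single fast-path skeleton parameterized by which slow-path routine to call when the xadd observes contention. The real code has to stay in inline assembly because the call_rwsem_* stubs use a custom register-preserving calling convention, but the shape of the logic, sketched in plain C under that caveat, is roughly:

static inline int down_write_common(struct rw_semaphore *sem,
		struct rw_semaphore *(*slow_path)(struct rw_semaphore *))
{
	/*
	 * Add the write bias; a result other than exactly the write bias
	 * means the lock was contended and we must take the slow path.
	 */
	long tmp = atomic_long_add_return(RWSEM_ACTIVE_WRITE_BIAS,
					  (atomic_long_t *)&sem->count);

	if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS) &&
	    IS_ERR(slow_path(sem)))
		return -EINTR;	/* only the killable slow path errors out */
	return 0;
}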
Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1459929916-12852-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/ras/Kconfig | 2 +- arch/x86/ras/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index df280da348255..d957d5f21a865 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -1,4 +1,4 @@ -config AMD_MCE_INJ +config MCE_AMD_INJ tristate "Simple MCE injection interface for AMD processors" depends on RAS && EDAC_DECODE_MCE && DEBUG_FS && AMD_NB default n diff --git a/arch/x86/ras/Makefile b/arch/x86/ras/Makefile index dd2c98b84037b..5f94546db280c 100644 --- a/arch/x86/ras/Makefile +++ b/arch/x86/ras/Makefile @@ -1,2 +1,2 @@ -obj-$(CONFIG_AMD_MCE_INJ) += mce_amd_inj.o +obj-$(CONFIG_MCE_AMD_INJ) += mce_amd_inj.o -- GitLab From bf92b1feb658f6a262daf3a87d790997a1dca0ff Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 6 Apr 2016 10:05:15 +0200 Subject: [PATCH 196/705] x86/mce: Remove explicit smp_rmb() when starting CPUs sync mce_start() has an explicit smp_wmb() to serialize writes to global_nwo and mce_callin. However, atomic_inc_return() implies barriers on both sides of the call, as such simply rely on this full SMP barrier. Signed-off-by: Davidlohr Bueso Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1458602396-840-1-git-send-email-dave@stgolabs.net Link: http://lkml.kernel.org/r/1459929916-12852-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index f0c921b03e424..6b7039c166b84 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -830,9 +830,9 @@ static int mce_start(int *no_way_out) atomic_add(*no_way_out, &global_nwo); /* - * global_nwo should be updated before mce_callin + * Rely on the implied barrier below, such that global_nwo + * is updated before mce_callin. */ - smp_wmb(); order = atomic_inc_return(&mce_callin); /* -- GitLab From fb90a6e93c0684ab2629a42462400603aa829b9c Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Mon, 4 Apr 2016 15:42:02 +0200 Subject: [PATCH 197/705] sched/debug: Don't dump sched debug info in SysRq-W sysrq_sched_debug_show() can dump a lot of information. Don't print out all that if we're just trying to get a list of blocked tasks (SysRq-W). The information is still accessible with SysRq-T. 
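For reference, the SysRq-W / SysRq-T behaviour being adjusted above can be driven without a console keyboard through /proc/sysrq-trigger. A minimal sketch, assuming CONFIG_MAGIC_SYSRQ is enabled and the kernel.sysrq sysctl permits it:

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/*
	 * "w" dumps blocked (uninterruptible) tasks; after this patch it no
	 * longer appends the full sched debug dump. "t" still prints the
	 * complete task and scheduler state.
	 */
	int fd = open("/proc/sysrq-trigger", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, "w", 1) != 1) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}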
Signed-off-by: Rabin Vincent Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459777322-30902-1-git-send-email-rabin.vincent@axis.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 11594230ef4de..06efbb9c95441 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5050,7 +5050,8 @@ void show_state_filter(unsigned long state_filter) touch_all_softlockup_watchdogs(); #ifdef CONFIG_SCHED_DEBUG - sysrq_sched_debug_show(); + if (!state_filter) + sysrq_sched_debug_show(); #endif rcu_read_unlock(); /* -- GitLab From 1886297ce0c8d563a08c8a8c4c0b97743e06cd37 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 11 Apr 2016 13:36:00 -0600 Subject: [PATCH 198/705] x86/mm/pat: Fix BUG_ON() in mmap_mem() on QEMU/i386 The following BUG_ON() crash was reported on QEMU/i386: kernel BUG at arch/x86/mm/physaddr.c:79! Call Trace: phys_mem_access_prot_allowed mmap_mem ? mmap_region mmap_region do_mmap vm_mmap_pgoff SyS_mmap_pgoff do_int80_syscall_32 entry_INT80_32 after commit: edfe63ec97ed ("x86/mtrr: Fix Xorg crashes in Qemu sessions") PAT is now set to disabled state when MTRRs are disabled. Thus, reactivating the __pa(high_memory) check in phys_mem_access_prot_allowed(). When CONFIG_DEBUG_VIRTUAL is set, __pa() calls __phys_addr(), which in turn calls slow_virt_to_phys() for 'high_memory'. Because 'high_memory' is set to (the max direct mapped virt addr + 1), it is not a valid virtual address. Hence, slow_virt_to_phys() returns 0 and hit the BUG_ON. Using __pa_nodebug() instead of __pa() will fix this BUG_ON. However, this code block, originally written for Pentiums and earlier, is no longer adequate since a 32-bit Xen guest has MTRRs disabled and supports ZONE_HIGHMEM. In this setup, this code sets UC attribute for accessing RAM in high memory range. Delete this code block as it has been unused for a long time. Reported-by: kernel test robot Reviewed-by: Borislav Petkov Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: David Vrabel Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1460403360-25441-1-git-send-email-toshi.kani@hpe.com Link: https://lkml.org/lkml/2016/4/1/608 Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index c4c3ddcc9069d..fb0604f11eec2 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -778,25 +778,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, if (file->f_flags & O_DSYNC) pcm = _PAGE_CACHE_MODE_UC_MINUS; -#ifdef CONFIG_X86_32 - /* - * On the PPro and successors, the MTRRs are used to set - * memory types for physical addresses outside main memory, - * so blindly setting UC or PWT on those pages is wrong. - * For Pentiums and earlier, the surround logic should disable - * caching for the high addresses through the KEN pin, but - * we maintain the tradition of paranoia in this code. 
- */ - if (!pat_enabled() && - !(boot_cpu_has(X86_FEATURE_MTRR) || - boot_cpu_has(X86_FEATURE_K6_MTRR) || - boot_cpu_has(X86_FEATURE_CYRIX_ARR) || - boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) && - (pfn << PAGE_SHIFT) >= __pa(high_memory)) { - pcm = _PAGE_CACHE_MODE_UC; - } -#endif - *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | cachemode2protval(pcm)); return 1; -- GitLab From abcfdfe07de75f830cbec1aa3eb17833a0166697 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 4 Apr 2016 22:24:54 +0200 Subject: [PATCH 199/705] x86/cpufeature: Replace cpu_has_avx2 with boot_cpu_has() usage Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-crypto@vger.kernel.org Link: http://lkml.kernel.org/r/1459801503-15600-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/crypto/camellia_aesni_avx2_glue.c | 2 +- arch/x86/crypto/chacha20_glue.c | 2 +- arch/x86/crypto/poly1305_glue.c | 2 +- arch/x86/crypto/serpent_avx2_glue.c | 2 +- arch/x86/include/asm/cpufeature.h | 1 - 5 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index c37f7028c85ae..39389662e29bf 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -562,7 +562,7 @@ static int __init camellia_aesni_init(void) { const char *feature_name; - if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || + if (!boot_cpu_has(X86_FEATURE_AVX2) || !cpu_has_avx || !cpu_has_aes || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX2 or AES-NI instructions are not detected.\n"); return -ENODEV; diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 8baaff5af0b57..cea061e137da6 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -129,7 +129,7 @@ static int __init chacha20_simd_mod_init(void) return -ENODEV; #ifdef CONFIG_AS_AVX2 - chacha20_use_avx2 = cpu_has_avx && cpu_has_avx2 && + chacha20_use_avx2 = cpu_has_avx && boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); #endif return crypto_register_alg(&alg); diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index b283868acdf85..ea21d2e440f7f 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -183,7 +183,7 @@ static int __init poly1305_simd_mod_init(void) return -ENODEV; #ifdef CONFIG_AS_AVX2 - poly1305_use_avx2 = cpu_has_avx && cpu_has_avx2 && + poly1305_use_avx2 = cpu_has_avx && boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); alg.descsize = sizeof(struct poly1305_simd_desc_ctx); if (poly1305_use_avx2) diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 408cae2b35438..870f6d812a2dd 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -538,7 +538,7 @@ static int __init init(void) { const char *feature_name; - if (!cpu_has_avx2 || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { + if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX2 instructions are not detected.\n"); return -ENODEV; } diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index c594e04bf529c..810166530cbfe 100644 --- a/arch/x86/include/asm/cpufeature.h +++ 
b/arch/x86/include/asm/cpufeature.h @@ -125,7 +125,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) #define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) #define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) -#define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) /* -- GitLab From 1f4dd7938ea575a2d1972e180eaef31e6edb1808 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 4 Apr 2016 22:24:55 +0200 Subject: [PATCH 200/705] x86/cpufeature: Replace cpu_has_aes with boot_cpu_has() usage Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-crypto@vger.kernel.org Link: http://lkml.kernel.org/r/1459801503-15600-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/crypto/camellia_aesni_avx2_glue.c | 3 ++- arch/x86/crypto/camellia_aesni_avx_glue.c | 4 +++- arch/x86/include/asm/cpufeature.h | 1 - 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index 39389662e29bf..c07f699826a0a 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -562,7 +562,8 @@ static int __init camellia_aesni_init(void) { const char *feature_name; - if (!boot_cpu_has(X86_FEATURE_AVX2) || !cpu_has_avx || !cpu_has_aes || + if (!boot_cpu_has(X86_FEATURE_AVX2) || !cpu_has_avx || + !boot_cpu_has(X86_FEATURE_AES) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX2 or AES-NI instructions are not detected.\n"); return -ENODEV; diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 65f64556725b2..6d256d59c5fd0 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -554,7 +554,9 @@ static int __init camellia_aesni_init(void) { const char *feature_name; - if (!cpu_has_avx || !cpu_has_aes || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { + if (!cpu_has_avx || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX or AES-NI instructions are not detected.\n"); return -ENODEV; } diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 810166530cbfe..a6627b30bf45d 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -123,7 +123,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) #define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR) #define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) -#define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) #define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) -- GitLab From da154e82af4d0c63e2334d5b3822426600b0490f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 4 Apr 2016 22:24:56 +0200 Subject: [PATCH 201/705] x86/cpufeature: Replace cpu_has_avx with boot_cpu_has() usage Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-crypto@vger.kernel.org Link: http://lkml.kernel.org/r/1459801503-15600-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/crypto/aesni-intel_glue.c | 2 +- arch/x86/crypto/camellia_aesni_avx2_glue.c | 3 ++- arch/x86/crypto/camellia_aesni_avx_glue.c | 2 +- arch/x86/crypto/chacha20_glue.c | 3 ++- arch/x86/crypto/poly1305_glue.c | 3 ++- arch/x86/crypto/sha1_ssse3_glue.c | 2 +- arch/x86/crypto/sha256_ssse3_glue.c | 2 +- arch/x86/crypto/sha512_ssse3_glue.c | 2 +- arch/x86/include/asm/cpufeature.h | 1 - arch/x86/include/asm/xor_avx.h | 4 ++-- 10 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 064c7e2bd7c8e..5b7fa14710073 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1477,7 +1477,7 @@ static int __init aesni_init(void) } aesni_ctr_enc_tfm = aesni_ctr_enc; #ifdef CONFIG_AS_AVX - if (cpu_has_avx) { + if (boot_cpu_has(X86_FEATURE_AVX)) { /* optimize performance of ctr mode encryption transform */ aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm; pr_info("AES CTR mode by8 optimization enabled\n"); diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index c07f699826a0a..60907c139c4e2 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -562,7 +562,8 @@ static int __init camellia_aesni_init(void) { const char *feature_name; - if (!boot_cpu_has(X86_FEATURE_AVX2) || !cpu_has_avx || + if (!boot_cpu_has(X86_FEATURE_AVX) || + !boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AES) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX2 or AES-NI instructions are not detected.\n"); diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 6d256d59c5fd0..d96429da88eb8 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -554,7 +554,7 @@ static int __init camellia_aesni_init(void) { const char *feature_name; - if (!cpu_has_avx || + if (!boot_cpu_has(X86_FEATURE_AVX) || !boot_cpu_has(X86_FEATURE_AES) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX or AES-NI instructions are not detected.\n"); diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index cea061e137da6..2d5c2e0bd939b 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -129,7 +129,8 @@ static int __init chacha20_simd_mod_init(void) return -ENODEV; #ifdef CONFIG_AS_AVX2 - chacha20_use_avx2 = cpu_has_avx && boot_cpu_has(X86_FEATURE_AVX2) && + chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); #endif return crypto_register_alg(&alg); diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index ea21d2e440f7f..e32142bc071d9 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -183,7 +183,8 @@ static int __init poly1305_simd_mod_init(void) return -ENODEV; #ifdef CONFIG_AS_AVX2 - poly1305_use_avx2 = cpu_has_avx && boot_cpu_has(X86_FEATURE_AVX2) && + poly1305_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); alg.descsize = sizeof(struct poly1305_simd_desc_ctx); if (poly1305_use_avx2) diff --git 
a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index dd14616b77397..1024e378a358f 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -166,7 +166,7 @@ static struct shash_alg sha1_avx_alg = { static bool avx_usable(void) { if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (cpu_has_avx) + if (boot_cpu_has(X86_FEATURE_AVX)) pr_info("AVX detected but unusable.\n"); return false; } diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 5f4d6086dc591..3ae0f43ebd376 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -201,7 +201,7 @@ static struct shash_alg sha256_avx_algs[] = { { static bool avx_usable(void) { if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (cpu_has_avx) + if (boot_cpu_has(X86_FEATURE_AVX)) pr_info("AVX detected but unusable.\n"); return false; } diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 34e5083d6f365..0b17c83d027dd 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -151,7 +151,7 @@ asmlinkage void sha512_transform_avx(u64 *digest, const char *data, static bool avx_usable(void) { if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (cpu_has_avx) + if (boot_cpu_has(X86_FEATURE_AVX)) pr_info("AVX detected but unusable.\n"); return false; } diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index a6627b30bf45d..3b232a120a5d8 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -123,7 +123,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) #define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR) #define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) -#define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) /* diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h index e45e556140af9..22a7b1870a318 100644 --- a/arch/x86/include/asm/xor_avx.h +++ b/arch/x86/include/asm/xor_avx.h @@ -167,12 +167,12 @@ static struct xor_block_template xor_block_avx = { #define AVX_XOR_SPEED \ do { \ - if (cpu_has_avx && boot_cpu_has(X86_FEATURE_OSXSAVE)) \ + if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \ xor_speed(&xor_block_avx); \ } while (0) #define AVX_SELECT(FASTEST) \ - (cpu_has_avx && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST) + (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST) #else -- GitLab From dda9edf7c1fdc0d7a7ed7f46299a26282190fb6d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 4 Apr 2016 22:24:57 +0200 Subject: [PATCH 202/705] x86/cpufeature: Replace cpu_has_xmm with boot_cpu_has() usage Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459801503-15600-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 - arch/x86/include/asm/xor_32.h | 2 +- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/init.c | 2 +- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3b232a120a5d8..6463258b4619f 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -122,7 +122,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC) #define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) #define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR) -#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) /* diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index c54beb44c4c1f..635eac5439229 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -550,7 +550,7 @@ static struct xor_block_template xor_block_pIII_sse = { #define XOR_TRY_TEMPLATES \ do { \ AVX_XOR_SPEED; \ - if (cpu_has_xmm) { \ + if (boot_cpu_has(X86_FEATURE_XMM)) { \ xor_speed(&xor_block_pIII_sse); \ xor_speed(&xor_block_sse_pf64); \ } else if (boot_cpu_has(X86_FEATURE_MMX)) { \ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 8e37cc8a539ad..b05aa68f88c0f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -526,7 +526,7 @@ static inline unsigned short get_fpu_swd(struct fpu *fpu) static inline unsigned short get_fpu_mxcsr(struct fpu *fpu) { - if (cpu_has_xmm) { + if (boot_cpu_has(X86_FEATURE_XMM)) { return fpu->state.fxsave.mxcsr; } else { return MXCSR_DEFAULT; diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 54c86fffbf9f8..9bbb332a71ff1 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -31,7 +31,7 @@ static void fpu__init_cpu_generic(void) if (cpu_has_fxsr) cr4_mask |= X86_CR4_OSFXSR; - if (cpu_has_xmm) + if (boot_cpu_has(X86_FEATURE_XMM)) cr4_mask |= X86_CR4_OSXMMEXCPT; if (cr4_mask) cr4_set_bits(cr4_mask); -- GitLab From a402a8dffc9f838b413c5ee0317d2d3184968f5b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 4 Apr 2016 22:24:58 +0200 Subject: [PATCH 203/705] x86/cpufeature: Replace cpu_has_fpu with boot_cpu_has() usage Use static_cpu_has() in the timing-sensitive paths in fpstate_init() and fpu__copy(). While at it, simplify the use in init_cyrix() and get rid of the ternary operator. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459801503-15600-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 - arch/x86/kernel/cpu/cyrix.c | 2 +- arch/x86/kernel/fpu/bugs.c | 2 +- arch/x86/kernel/fpu/core.c | 4 ++-- arch/x86/kernel/fpu/init.c | 8 ++++---- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 6463258b4619f..b23d5570a5f42 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -118,7 +118,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) -#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) #define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC) #define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) #define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR) diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 6adef9cac23ee..bd9dcd6b712d0 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -333,7 +333,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) switch (dir0_lsn) { case 0xd: /* either a 486SLC or DLC w/o DEVID */ dir0_msn = 0; - p = Cx486_name[(cpu_has_fpu ? 1 : 0)]; + p = Cx486_name[!!boot_cpu_has(X86_FEATURE_FPU)]; break; case 0xe: /* a 486S A step */ diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index dd9ca9b60ff3a..224b5ec521959 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -66,6 +66,6 @@ void __init fpu__init_check_bugs(void) * kernel_fpu_begin/end() in check_fpu() relies on the patched * alternative instructions. */ - if (cpu_has_fpu) + if (boot_cpu_has(X86_FEATURE_FPU)) check_fpu(); } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b05aa68f88c0f..0e7859f9aedc6 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -217,7 +217,7 @@ static inline void fpstate_init_fstate(struct fregs_state *fp) void fpstate_init(union fpregs_state *state) { - if (!cpu_has_fpu) { + if (!static_cpu_has(X86_FEATURE_FPU)) { fpstate_init_soft(&state->soft); return; } @@ -237,7 +237,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; - if (!src_fpu->fpstate_active || !cpu_has_fpu) + if (!src_fpu->fpstate_active || !static_cpu_has(X86_FEATURE_FPU)) return 0; WARN_ON_FPU(src_fpu != ¤t->thread.fpu); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 9bbb332a71ff1..3a84275f012e8 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -38,13 +38,13 @@ static void fpu__init_cpu_generic(void) cr0 = read_cr0(); cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ - if (!cpu_has_fpu) + if (!boot_cpu_has(X86_FEATURE_FPU)) cr0 |= X86_CR0_EM; write_cr0(cr0); /* Flush out any pending x87 state: */ #ifdef CONFIG_MATH_EMULATION - if (!cpu_has_fpu) + if (!boot_cpu_has(X86_FEATURE_FPU)) fpstate_init_soft(¤t->thread.fpu.state.soft); else #endif @@ -89,7 +89,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) } #ifndef CONFIG_MATH_EMULATION - if (!cpu_has_fpu) { + if (!boot_cpu_has(X86_FEATURE_FPU)) { pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n"); for (;;) asm volatile("hlt"); @@ -212,7 +212,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) * fpu__init_system_xstate(). 
*/ - if (!cpu_has_fpu) { + if (!boot_cpu_has(X86_FEATURE_FPU)) { /* * Disable xsave as we do not support it if i387 * emulation is enabled. -- GitLab From 59e21e3d00e6bc23186763c3e0bf11baf8924124 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 4 Apr 2016 22:24:59 +0200 Subject: [PATCH 204/705] x86/cpufeature: Replace cpu_has_tsc with boot_cpu_has() usage Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Torokhov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thomas Sailer Link: http://lkml.kernel.org/r/1459801503-15600-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 - arch/x86/include/asm/tsc.h | 2 +- arch/x86/kernel/apic/apic.c | 10 +++++----- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/tsc.c | 12 ++++++------ drivers/input/joystick/analog.c | 6 +++--- drivers/net/hamradio/baycom_epp.c | 8 ++++---- 7 files changed, 20 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index b23d5570a5f42..8f58cd215f6d4 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -118,7 +118,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) -#define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC) #define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) #define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 174c4212780af..7428697c5b8df 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -22,7 +22,7 @@ extern void disable_TSC(void); static inline cycles_t get_cycles(void) { #ifndef CONFIG_X86_TSC - if (!cpu_has_tsc) + if (!boot_cpu_has(X86_FEATURE_TSC)) return 0; #endif diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d7867c885bf8a..0b6509f1a4fe9 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -607,7 +607,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) long tapic = apic_read(APIC_TMCCT); unsigned long pm = acpi_pm_read_early(); - if (cpu_has_tsc) + if (boot_cpu_has(X86_FEATURE_TSC)) tsc = rdtsc(); switch (lapic_cal_loops++) { @@ -668,7 +668,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) *delta = (long)res; /* Correct the tsc counter value */ - if (cpu_has_tsc) { + if (boot_cpu_has(X86_FEATURE_TSC)) { res = (((u64)(*deltatsc)) * pm_100ms); do_div(res, deltapm); apic_printk(APIC_VERBOSE, "TSC delta adjusted to " @@ -760,7 +760,7 @@ static int __init calibrate_APIC_clock(void) apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", lapic_timer_frequency); - if (cpu_has_tsc) { + if (boot_cpu_has(X86_FEATURE_TSC)) { apic_printk(APIC_VERBOSE, "..... CPU clock speed is " "%ld.%04ld MHz.\n", (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ), @@ -1227,7 +1227,7 @@ void setup_local_APIC(void) unsigned long long tsc = 0, ntsc; long long max_loops = cpu_khz ? 
cpu_khz : 1000000; - if (cpu_has_tsc) + if (boot_cpu_has(X86_FEATURE_TSC)) tsc = rdtsc(); if (disable_apic) { @@ -1311,7 +1311,7 @@ void setup_local_APIC(void) break; } if (queued) { - if (cpu_has_tsc && cpu_khz) { + if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) { ntsc = rdtsc(); max_loops = (cpu_khz << 10) - (ntsc - tsc); } else diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 28d3255edf000..6bfa36de6d9f6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1558,7 +1558,7 @@ void cpu_init(void) pr_info("Initializing CPU#%d\n", cpu); if (cpu_feature_enabled(X86_FEATURE_VME) || - cpu_has_tsc || + boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE)) cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index c9c4c7ce3eb23..a0346bc518335 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -36,7 +36,7 @@ static int __read_mostly tsc_unstable; /* native_sched_clock() is called before tsc_init(), so we must start with the TSC soft disabled to prevent - erroneous rdtsc usage on !cpu_has_tsc processors */ + erroneous rdtsc usage on !boot_cpu_has(X86_FEATURE_TSC) processors */ static int __read_mostly tsc_disabled = -1; static DEFINE_STATIC_KEY_FALSE(__use_tsc); @@ -834,7 +834,7 @@ int recalibrate_cpu_khz(void) #ifndef CONFIG_SMP unsigned long cpu_khz_old = cpu_khz; - if (cpu_has_tsc) { + if (boot_cpu_has(X86_FEATURE_TSC)) { tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = @@ -956,7 +956,7 @@ static struct notifier_block time_cpufreq_notifier_block = { static int __init cpufreq_tsc(void) { - if (!cpu_has_tsc) + if (!boot_cpu_has(X86_FEATURE_TSC)) return 0; if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; @@ -1081,7 +1081,7 @@ static void __init check_system_tsc_reliable(void) */ int unsynchronized_tsc(void) { - if (!cpu_has_tsc || tsc_unstable) + if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable) return 1; #ifdef CONFIG_SMP @@ -1205,7 +1205,7 @@ static void tsc_refine_calibration_work(struct work_struct *work) static int __init init_tsc_clocksource(void) { - if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz) + if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz) return 0; if (tsc_clocksource_reliable) @@ -1242,7 +1242,7 @@ void __init tsc_init(void) u64 lpj; int cpu; - if (!cpu_has_tsc) { + if (!boot_cpu_has(X86_FEATURE_TSC)) { setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c index 6f8b084e13d07..3d8ff09eba576 100644 --- a/drivers/input/joystick/analog.c +++ b/drivers/input/joystick/analog.c @@ -143,9 +143,9 @@ struct analog_port { #include -#define GET_TIME(x) do { if (cpu_has_tsc) x = (unsigned int)rdtsc(); else x = get_time_pit(); } while (0) -#define DELTA(x,y) (cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0))) -#define TIME_NAME (cpu_has_tsc?"TSC":"PIT") +#define GET_TIME(x) do { if (boot_cpu_has(X86_FEATURE_TSC)) x = (unsigned int)rdtsc(); else x = get_time_pit(); } while (0) +#define DELTA(x,y) (boot_cpu_has(X86_FEATURE_TSC) ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? 
PIT_TICK_RATE / HZ : 0))) +#define TIME_NAME (boot_cpu_has(X86_FEATURE_TSC)?"TSC":"PIT") static unsigned int get_time_pit(void) { unsigned long flags; diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 72c9f1f352b4e..7c7830722ea2c 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -635,10 +635,10 @@ static int receive(struct net_device *dev, int cnt) #ifdef __i386__ #include -#define GETTICK(x) \ -({ \ - if (cpu_has_tsc) \ - x = (unsigned int)rdtsc(); \ +#define GETTICK(x) \ +({ \ + if (boot_cpu_has(X86_FEATURE_TSC)) \ + x = (unsigned int)rdtsc(); \ }) #else /* __i386__ */ #define GETTICK(x) -- GitLab From 93984fbd4e33cc861d5b49caed02a02cbfb01340 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 4 Apr 2016 22:25:00 +0200 Subject: [PATCH 205/705] x86/cpufeature: Replace cpu_has_apic with boot_cpu_has() usage Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: iommu@lists.linux-foundation.org Cc: linux-pm@vger.kernel.org Cc: oprofile-list@lists.sf.net Link: http://lkml.kernel.org/r/1459801503-15600-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 2 +- arch/x86/include/asm/cpufeature.h | 1 - arch/x86/include/asm/irq_work.h | 2 +- arch/x86/kernel/acpi/boot.c | 8 ++++---- arch/x86/kernel/apic/apic.c | 20 ++++++++++---------- arch/x86/kernel/apic/apic_noop.c | 4 ++-- arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/apic/ipi.c | 2 +- arch/x86/kernel/apic/vector.c | 2 +- arch/x86/kernel/cpu/amd.c | 4 ++-- arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_intel.c | 2 +- arch/x86/kernel/cpu/mcheck/therm_throt.c | 2 +- arch/x86/kernel/devicetree.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/oprofile/nmi_int.c | 2 +- arch/x86/pci/xen.c | 2 +- drivers/cpufreq/longhaul.c | 2 +- drivers/iommu/irq_remapping.c | 2 +- 19 files changed, 32 insertions(+), 33 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 041e442a3e280..54c17455600ef 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1518,7 +1518,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) static void __init pmu_check_apic(void) { - if (cpu_has_apic) + if (boot_cpu_has(X86_FEATURE_APIC)) return; x86_pmu.apic = 0; diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 8f58cd215f6d4..c532961c7439d 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -118,7 +118,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) -#define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) #define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index d0afb05c84fc1..f70604125286b 100644 --- a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -5,7 +5,7 @@ static inline bool arch_irq_work_has_interrupt(void) { - return cpu_has_apic; + return boot_cpu_has(X86_FEATURE_APIC); } #endif /* _ASM_IRQ_WORK_H */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8c2f1ef6ca236..2522e564269e2 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ 
-136,7 +136,7 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) { struct acpi_table_madt *madt = NULL; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -EINVAL; madt = (struct acpi_table_madt *)table; @@ -951,7 +951,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) { int count; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; /* @@ -979,7 +979,7 @@ static int __init acpi_parse_madt_lapic_entries(void) int ret; struct acpi_subtable_proc madt_proc[2]; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; /* @@ -1125,7 +1125,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) if (acpi_disabled || acpi_noirq) return -ENODEV; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; /* diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0b6509f1a4fe9..60078a67d7e36 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1085,7 +1085,7 @@ void lapic_shutdown(void) { unsigned long flags; - if (!cpu_has_apic && !apic_from_smp_config()) + if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config()) return; local_irq_save(flags); @@ -1134,7 +1134,7 @@ void __init init_bsp_APIC(void) * Don't do the setup now if we have a SMP BIOS as the * through-I/O-APIC virtual wire mode might be active. */ - if (smp_found_config || !cpu_has_apic) + if (smp_found_config || !boot_cpu_has(X86_FEATURE_APIC)) return; /* @@ -1445,7 +1445,7 @@ static void __x2apic_disable(void) { u64 msr; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return; rdmsrl(MSR_IA32_APICBASE, msr); @@ -1632,7 +1632,7 @@ void __init enable_IR_x2apic(void) */ static int __init detect_init_APIC(void) { - if (!cpu_has_apic) { + if (!boot_cpu_has(X86_FEATURE_APIC)) { pr_info("No local APIC present\n"); return -1; } @@ -1711,14 +1711,14 @@ static int __init detect_init_APIC(void) goto no_apic; case X86_VENDOR_INTEL: if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || - (boot_cpu_data.x86 == 5 && cpu_has_apic)) + (boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC))) break; goto no_apic; default: goto no_apic; } - if (!cpu_has_apic) { + if (!boot_cpu_has(X86_FEATURE_APIC)) { /* * Over-ride BIOS and try to enable the local APIC only if * "lapic" specified. @@ -2233,19 +2233,19 @@ int __init APIC_init_uniprocessor(void) return -1; } #ifdef CONFIG_X86_64 - if (!cpu_has_apic) { + if (!boot_cpu_has(X86_FEATURE_APIC)) { disable_apic = 1; pr_info("Apic disabled by BIOS\n"); return -1; } #else - if (!smp_found_config && !cpu_has_apic) + if (!smp_found_config && !boot_cpu_has(X86_FEATURE_APIC)) return -1; /* * Complain if the BIOS pretends there is one. */ - if (!cpu_has_apic && + if (!boot_cpu_has(X86_FEATURE_APIC) && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { pr_err("BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); @@ -2426,7 +2426,7 @@ static void apic_pm_activate(void) static int __init init_lapic_sysfs(void) { /* XXX: remove suspend/resume procs if !apic_pm_state.active? 
*/ - if (cpu_has_apic) + if (boot_cpu_has(X86_FEATURE_APIC)) register_syscore_ops(&lapic_syscore_ops); return 0; diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 331a7a07c48fe..13d19ed585142 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -100,13 +100,13 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask, static u32 noop_apic_read(u32 reg) { - WARN_ON_ONCE((cpu_has_apic && !disable_apic)); + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); return 0; } static void noop_apic_write(u32 reg, u32 v) { - WARN_ON_ONCE(cpu_has_apic && !disable_apic); + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); } struct apic apic_noop = { diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index fdb0fbfb1197a..84e33ff5a6d59 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1454,7 +1454,7 @@ void native_disable_io_apic(void) ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } - if (cpu_has_apic || apic_from_smp_config()) + if (boot_cpu_has(X86_FEATURE_APIC) || apic_from_smp_config()) disconnect_bsp_APIC(ioapic_i8259.pin != -1); } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 28bde88b0085d..2a0f225afebd5 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -230,7 +230,7 @@ int safe_smp_processor_id(void) { int apicid, cpuid; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return 0; apicid = hard_smp_processor_id(); diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index ad59d70bcb1a6..26d3ccc63e407 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -943,7 +943,7 @@ static int __init print_ICs(void) print_PIC(); /* don't print out if apic is not there */ - if (!cpu_has_apic && !apic_from_smp_config()) + if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config()) return 0; print_local_APICs(show_lapic); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 19d7dcfc8b3e1..54f7b44dcf01f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -565,9 +565,9 @@ static void early_init_amd(struct cpuinfo_x86 *c) * can safely set X86_FEATURE_EXTD_APICID unconditionally for families * after 16h. */ - if (cpu_has_apic && c->x86 > 0x16) { + if (boot_cpu_has(X86_FEATURE_APIC) && c->x86 > 0x16) { set_cpu_cap(c, X86_FEATURE_EXTD_APICID); - } else if (cpu_has_apic && c->x86 >= 0xf) { + } else if (boot_cpu_has(X86_FEATURE_APIC) && c->x86 >= 0xf) { /* check CPU config space for extended APIC ID */ unsigned int val; val = read_pci_config(0, 24, 0, 0x68); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index f71a34944b560..1d5582259b20b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -281,7 +281,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * integrated APIC (see 11AP erratum in "Pentium Processor * Specification Update"). 
*/ - if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && + if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 && (c->x86_mask < 0x6 || c->x86_mask == 0xb)) set_cpu_bug(c, X86_BUG_11AP); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 1e8bb6c94f14c..1defb8ea882c0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -84,7 +84,7 @@ static int cmci_supported(int *banks) */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; - if (!cpu_has_apic || lapic_get_maxlvt() < 6) + if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) return 0; rdmsrl(MSR_IA32_MCG_CAP, cap); *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 0b445c2ff735d..615793321c493 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -447,7 +447,7 @@ asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs) /* Thermal monitoring depends on APIC, ACPI and clock modulation */ static int intel_thermal_supported(struct cpuinfo_x86 *c) { - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return 0; if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) return 0; diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 1f4acd68b98bc..3fe45f84ced44 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -151,7 +151,7 @@ static void __init dtb_lapic_setup(void) return; /* Did the boot loader setup the local APIC ? */ - if (!cpu_has_apic) { + if (!boot_cpu_has(X86_FEATURE_APIC)) { if (apic_force_enable(r.start)) return; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a2065d3b3b396..1fe4130b14d92 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1231,7 +1231,7 @@ static int __init smp_sanity_check(unsigned max_cpus) * If we couldn't find a local APIC, then get out of here now! */ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && - !cpu_has_apic) { + !boot_cpu_has(X86_FEATURE_APIC)) { if (!disable_apic) { pr_err("BIOS bug, local APIC #%d not detected!...\n", boot_cpu_physical_apicid); diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 25171e9595f74..28c04123b6dda 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -700,7 +700,7 @@ int __init op_nmi_init(struct oprofile_operations *ops) char *cpu_type = NULL; int ret = 0; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; if (force_cpu_type == timer) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index beac4dfdade6c..4bd08b0fc8ea1 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -445,7 +445,7 @@ void __init xen_msi_init(void) uint32_t eax = cpuid_eax(xen_cpuid_base() + 4); if (((eax & XEN_HVM_CPUID_X2APIC_VIRT) && x2apic_mode) || - ((eax & XEN_HVM_CPUID_APIC_ACCESS_VIRT) && cpu_has_apic)) + ((eax & XEN_HVM_CPUID_APIC_ACCESS_VIRT) && boot_cpu_has(X86_FEATURE_APIC))) return; } diff --git a/drivers/cpufreq/longhaul.c b/drivers/cpufreq/longhaul.c index 0f6b229afcb9e..247bfa8eaddbf 100644 --- a/drivers/cpufreq/longhaul.c +++ b/drivers/cpufreq/longhaul.c @@ -945,7 +945,7 @@ static int __init longhaul_init(void) } #endif #ifdef CONFIG_X86_IO_APIC - if (cpu_has_apic) { + if (boot_cpu_has(X86_FEATURE_APIC)) { printk(KERN_ERR PFX "APIC detected. 
Longhaul is currently " "broken in this configuration.\n"); return -ENODEV; diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 8adaaeae32681..49721b4e1975c 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -36,7 +36,7 @@ static void irq_remapping_disable_io_apic(void) * As this gets called during crash dump, keep this simple for * now. */ - if (cpu_has_apic || apic_from_smp_config()) + if (boot_cpu_has(X86_FEATURE_APIC) || apic_from_smp_config()) disconnect_bsp_APIC(0); } -- GitLab From 01f8fd7379149fb9a4046e76617958bf771f856f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 4 Apr 2016 22:25:01 +0200 Subject: [PATCH 206/705] x86/cpufeature: Replace cpu_has_fxsr with boot_cpu_has() usage Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459801503-15600-9-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 - arch/x86/kernel/fpu/core.c | 6 +++--- arch/x86/kernel/fpu/init.c | 6 +++--- arch/x86/kernel/fpu/regset.c | 13 ++++++++----- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index c532961c7439d..526381a145469 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -118,7 +118,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) -#define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) /* diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 0e7859f9aedc6..1551b28398a4a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -224,7 +224,7 @@ void fpstate_init(union fpregs_state *state) memset(state, 0, xstate_size); - if (cpu_has_fxsr) + if (static_cpu_has(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); else fpstate_init_fstate(&state->fsave); @@ -508,7 +508,7 @@ void fpu__clear(struct fpu *fpu) static inline unsigned short get_fpu_cwd(struct fpu *fpu) { - if (cpu_has_fxsr) { + if (boot_cpu_has(X86_FEATURE_FXSR)) { return fpu->state.fxsave.cwd; } else { return (unsigned short)fpu->state.fsave.cwd; @@ -517,7 +517,7 @@ static inline unsigned short get_fpu_cwd(struct fpu *fpu) static inline unsigned short get_fpu_swd(struct fpu *fpu) { - if (cpu_has_fxsr) { + if (boot_cpu_has(X86_FEATURE_FXSR)) { return fpu->state.fxsave.swd; } else { return (unsigned short)fpu->state.fsave.swd; diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 3a84275f012e8..aacfd7a82cec5 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -29,7 +29,7 @@ static void fpu__init_cpu_generic(void) unsigned long cr0; unsigned long cr4_mask = 0; - if (cpu_has_fxsr) + if (boot_cpu_has(X86_FEATURE_FXSR)) cr4_mask |= X86_CR4_OSFXSR; if (boot_cpu_has(X86_FEATURE_XMM)) cr4_mask |= X86_CR4_OSXMMEXCPT; @@ -106,7 +106,7 @@ static void __init fpu__init_system_mxcsr(void) { unsigned int mask = 0; - if (cpu_has_fxsr) { + if (boot_cpu_has(X86_FEATURE_FXSR)) { /* Static because GCC does not get 16-byte stack alignment right: */ static struct fxregs_state fxregs __initdata; @@ -221,7 +221,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) 
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); xstate_size = sizeof(struct swregs_state); } else { - if (cpu_has_fxsr) + if (boot_cpu_has(X86_FEATURE_FXSR)) xstate_size = sizeof(struct fxregs_state); else xstate_size = sizeof(struct fregs_state); diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 8bd1c003942aa..4cff7af735c5e 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -21,7 +21,10 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r { struct fpu *target_fpu = &target->thread.fpu; - return (cpu_has_fxsr && target_fpu->fpstate_active) ? regset->n : 0; + if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->fpstate_active) + return regset->n; + else + return 0; } int xfpregs_get(struct task_struct *target, const struct user_regset *regset, @@ -30,7 +33,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, { struct fpu *fpu = &target->thread.fpu; - if (!cpu_has_fxsr) + if (!boot_cpu_has(X86_FEATURE_FXSR)) return -ENODEV; fpu__activate_fpstate_read(fpu); @@ -47,7 +50,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, struct fpu *fpu = &target->thread.fpu; int ret; - if (!cpu_has_fxsr) + if (!boot_cpu_has(X86_FEATURE_FXSR)) return -ENODEV; fpu__activate_fpstate_write(fpu); @@ -278,7 +281,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); - if (!cpu_has_fxsr) + if (!boot_cpu_has(X86_FEATURE_FXSR)) return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &fpu->state.fsave, 0, -1); @@ -309,7 +312,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); - if (!cpu_has_fxsr) + if (!boot_cpu_has(X86_FEATURE_FXSR)) return user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpu->state.fsave, 0, -1); -- GitLab From d366bf7eb99d0644e47ecd52c184d7ad95df02f2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 4 Apr 2016 22:25:02 +0200 Subject: [PATCH 207/705] x86/cpufeature: Replace cpu_has_xsave with boot_cpu_has() usage Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kvm@vger.kernel.org Link: http://lkml.kernel.org/r/1459801503-15600-10-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/cpufeature.h | 1 - arch/x86/kernel/fpu/regset.c | 8 ++++---- arch/x86/kernel/fpu/xstate.c | 8 ++++---- arch/x86/kernel/signal.c | 4 ++-- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/x86.c | 12 ++++++------ 7 files changed, 18 insertions(+), 19 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 0552884da18db..2f29f4e407c31 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -357,7 +357,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, put_user_ex(ptr_to_compat(&frame->uc), &frame->puc); /* Create the ucontext. 
*/ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); else put_user_ex(0, &frame->uc.uc_flags); diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 526381a145469..732a00f12ac0f 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -118,7 +118,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) -#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) /* * Do not add any more of those clumsy macros - use static_cpu_has() for diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 4cff7af735c5e..bc5e76c1d7c5c 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -68,7 +68,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, * update the header bits in the xsave header, indicating the * presence of FP and SSE state. */ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; return ret; @@ -82,7 +82,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, struct xregs_state *xsave; int ret; - if (!cpu_has_xsave) + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -ENODEV; fpu__activate_fpstate_read(fpu); @@ -111,7 +111,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, struct xregs_state *xsave; int ret; - if (!cpu_has_xsave) + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -ENODEV; fpu__activate_fpstate_write(fpu); @@ -328,7 +328,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, * update the header bit in the xsave header, indicating the * presence of FP. */ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP; return ret; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b48ef35b28d4f..18b9fd809fe78 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -190,7 +190,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) */ void fpu__init_cpu_xstate(void) { - if (!cpu_has_xsave || !xfeatures_mask) + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask) return; cr4_set_bits(X86_CR4_OSXSAVE); @@ -316,7 +316,7 @@ static void __init setup_init_fpu_buf(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; - if (!cpu_has_xsave) + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; setup_xstate_features(); @@ -630,7 +630,7 @@ void __init fpu__init_system_xstate(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; - if (!cpu_has_xsave) { + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { pr_info("x86/fpu: Legacy x87 FPU detected.\n"); return; } @@ -678,7 +678,7 @@ void fpu__resume_cpu(void) /* * Restore XCR0 on xsave capable CPUs: */ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 548ddf7d6fd20..6408c09bbcd4a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -391,7 +391,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, put_user_ex(&frame->uc, &frame->puc); /* Create the ucontext. 
*/ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); else put_user_ex(0, &frame->uc.uc_flags); @@ -442,7 +442,7 @@ static unsigned long frame_uc_flags(struct pt_regs *regs) { unsigned long flags; - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; else flags = UC_SIGCONTEXT_SS; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8efb839948e51..a056b72c2f33b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -75,7 +75,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) return 0; /* Update OSXSAVE bit */ - if (cpu_has_xsave && best->function == 0x1) { + if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1) { best->ecx &= ~F(OSXSAVE); if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) best->ecx |= F(OSXSAVE); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 742d0f7d3556e..4eb2fca335c97 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2612,7 +2612,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_MAX_MCE_BANKS; break; case KVM_CAP_XCRS: - r = cpu_has_xsave; + r = boot_cpu_has(X86_FEATURE_XSAVE); break; case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; @@ -3122,7 +3122,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - if (cpu_has_xsave) { + if (boot_cpu_has(X86_FEATURE_XSAVE)) { memset(guest_xsave, 0, sizeof(struct kvm_xsave)); fill_xsave((u8 *) guest_xsave->region, vcpu); } else { @@ -3140,7 +3140,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, u64 xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - if (cpu_has_xsave) { + if (boot_cpu_has(X86_FEATURE_XSAVE)) { /* * Here we allow setting states that are not present in * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility @@ -3161,7 +3161,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, struct kvm_xcrs *guest_xcrs) { - if (!cpu_has_xsave) { + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { guest_xcrs->nr_xcrs = 0; return; } @@ -3177,7 +3177,7 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, { int i, r = 0; - if (!cpu_has_xsave) + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -EINVAL; if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) @@ -5866,7 +5866,7 @@ int kvm_arch_init(void *opaque) perf_register_guest_info_callbacks(&kvm_guest_cbs); - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_lapic_init(); -- GitLab From 782511b00f749cfebc0cb5d6ce960de5410c221d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 4 Apr 2016 22:25:03 +0200 Subject: [PATCH 208/705] x86/cpufeature: Replace cpu_has_xsaves with boot_cpu_has() usage Signed-off-by: Borislav Petkov Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459801503-15600-11-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 6 ------ arch/x86/kernel/fpu/xstate.c | 10 +++++----- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 4 ++-- 4 files changed, 8 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 732a00f12ac0f..07c942d846628 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -118,12 +118,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) -#define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) -/* - * Do not add any more of those clumsy macros - use static_cpu_has() for - * fast paths and boot_cpu_has() otherwise! - */ - #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) /* * Static testing of CPU features. Used the same as boot_cpu_has(). diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 18b9fd809fe78..4ea2a59483c7b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -280,7 +280,7 @@ static void __init setup_xstate_comp(void) xstate_comp_offsets[0] = 0; xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space); - if (!cpu_has_xsaves) { + if (!boot_cpu_has(X86_FEATURE_XSAVES)) { for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if (xfeature_enabled(i)) { xstate_comp_offsets[i] = xstate_offsets[i]; @@ -322,7 +322,7 @@ static void __init setup_init_fpu_buf(void) setup_xstate_features(); print_xstate_features(); - if (cpu_has_xsaves) { + if (boot_cpu_has(X86_FEATURE_XSAVES)) { init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; init_fpstate.xsave.header.xfeatures = xfeatures_mask; } @@ -417,7 +417,7 @@ static int xfeature_size(int xfeature_nr) */ static int using_compacted_format(void) { - return cpu_has_xsaves; + return boot_cpu_has(X86_FEATURE_XSAVES); } static void __xstate_dump_leaves(void) @@ -549,7 +549,7 @@ static unsigned int __init calculate_xstate_size(void) unsigned int eax, ebx, ecx, edx; unsigned int calculated_xstate_size; - if (!cpu_has_xsaves) { + if (!boot_cpu_has(X86_FEATURE_XSAVES)) { /* * - CPUID function 0DH, sub-function 0: * EBX enumerates the size (in bytes) required by @@ -667,7 +667,7 @@ void __init fpu__init_system_xstate(void) pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", xfeatures_mask, xstate_size, - cpu_has_xsaves ? "compacted" : "standard"); + boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); } /* diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ee1c8a93871c5..d5908bde93429 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3386,7 +3386,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) } } - if (cpu_has_xsaves) + if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4eb2fca335c97..33102ded1398c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3095,7 +3095,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) /* Set XSTATE_BV and possibly XCOMP_BV. 
*/ xsave->header.xfeatures = xstate_bv; - if (cpu_has_xsaves) + if (boot_cpu_has(X86_FEATURE_XSAVES)) xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; /* @@ -7292,7 +7292,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) static void fx_init(struct kvm_vcpu *vcpu) { fpstate_init(&vcpu->arch.guest_fpu.state); - if (cpu_has_xsaves) + if (boot_cpu_has(X86_FEATURE_XSAVES)) vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; -- GitLab From 78df526c74a4db696e1e058b9869471937d0773b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 5 Apr 2016 08:29:50 +0200 Subject: [PATCH 209/705] x86/fpu/regset: Replace static_cpu_has() usage with boot_cpu_has() fpregs_{g,s}et() are not sizzling-hot paths to justify the need for static_cpu_has(). Use the normal boot_cpu_has() helper. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459837795-2588-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/regset.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index bc5e76c1d7c5c..81422dfb152b7 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -278,7 +278,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_read(fpu); - if (!static_cpu_has(X86_FEATURE_FPU)) + if (!boot_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); if (!boot_cpu_has(X86_FEATURE_FXSR)) @@ -309,7 +309,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_write(fpu); fpstate_sanitize_xstate(fpu); - if (!static_cpu_has(X86_FEATURE_FPU)) + if (!boot_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); if (!boot_cpu_has(X86_FEATURE_FXSR)) -- GitLab From 425d8c2fc5e6dddbad083502bb77c7beae545620 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 5 Apr 2016 08:29:51 +0200 Subject: [PATCH 210/705] x86/cpu: Simplify extended APIC ID detection on AMD Both if-branches are under if (boot_cpu_has(X86_FEATURE_APIC)), unify them. Also, simplify the test for bits: - 17 ("ApicExtBrdCst: APIC extended broadcast enable") and - 18 ("ApicExtId: APIC extended ID enable.") in "D18F0x68 Link Transaction Control." No functionality change. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459837795-2588-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 54f7b44dcf01f..c343a54bed396 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -565,14 +565,17 @@ static void early_init_amd(struct cpuinfo_x86 *c) * can safely set X86_FEATURE_EXTD_APICID unconditionally for families * after 16h. 
*/ - if (boot_cpu_has(X86_FEATURE_APIC) && c->x86 > 0x16) { - set_cpu_cap(c, X86_FEATURE_EXTD_APICID); - } else if (boot_cpu_has(X86_FEATURE_APIC) && c->x86 >= 0xf) { - /* check CPU config space for extended APIC ID */ - unsigned int val; - val = read_pci_config(0, 24, 0, 0x68); - if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) + if (boot_cpu_has(X86_FEATURE_APIC)) { + if (c->x86 > 0x16) set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + else if (c->x86 >= 0xf) { + /* check CPU config space for extended APIC ID */ + unsigned int val; + + val = read_pci_config(0, 24, 0, 0x68); + if ((val >> 17 & 0x3) == 0x3) + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + } } #endif -- GitLab From a841cca74ea7612508aee161c89987b2646ed769 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 5 Apr 2016 08:29:52 +0200 Subject: [PATCH 211/705] x86/tsc: Do not check X86_FEATURE_CONSTANT_TSC in notifier call ... because the notifier-registering routine already does that. Also, rename cpufreq_tsc() init call to something more telling. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459837795-2588-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a0346bc518335..5bb702c77e8fe 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -922,9 +922,6 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, struct cpufreq_freqs *freq = data; unsigned long *lpj; - if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) - return 0; - lpj = &boot_cpu_data.loops_per_jiffy; #ifdef CONFIG_SMP if (!(freq->flags & CPUFREQ_CONST_LOOPS)) @@ -954,7 +951,7 @@ static struct notifier_block time_cpufreq_notifier_block = { .notifier_call = time_cpufreq_notifier }; -static int __init cpufreq_tsc(void) +static int __init cpufreq_register_tsc_scaling(void) { if (!boot_cpu_has(X86_FEATURE_TSC)) return 0; @@ -965,7 +962,7 @@ static int __init cpufreq_tsc(void) return 0; } -core_initcall(cpufreq_tsc); +core_initcall(cpufreq_register_tsc_scaling); #endif /* CONFIG_CPU_FREQ */ -- GitLab From eff4677e9fb9b680d1d5f6ba079116548d072b7e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 5 Apr 2016 08:29:53 +0200 Subject: [PATCH 212/705] x86/tsc: Save an indentation level in recalibrate_cpu_khz() ... by flipping the check. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459837795-2588-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 5bb702c77e8fe..38ba6de56edec 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -834,15 +834,15 @@ int recalibrate_cpu_khz(void) #ifndef CONFIG_SMP unsigned long cpu_khz_old = cpu_khz; - if (boot_cpu_has(X86_FEATURE_TSC)) { - tsc_khz = x86_platform.calibrate_tsc(); - cpu_khz = tsc_khz; - cpu_data(0).loops_per_jiffy = - cpufreq_scale(cpu_data(0).loops_per_jiffy, - cpu_khz_old, cpu_khz); - return 0; - } else + if (!boot_cpu_has(X86_FEATURE_TSC)) return -ENODEV; + + tsc_khz = x86_platform.calibrate_tsc(); + cpu_khz = tsc_khz; + cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, + cpu_khz_old, cpu_khz); + + return 0; #else return -ENODEV; #endif -- GitLab From de82fbc3823b7b15ee03466ebfb1c5ec7cc1a941 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 5 Apr 2016 08:29:54 +0200 Subject: [PATCH 213/705] x86/fpu: Remove check_fpu() indirection Rename it to fpu__init_check_bugs() and do the CPU feature check at entry, thus getting rid of the old fpu__init_check_bugs() wrapper. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459837795-2588-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/bugs.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index 224b5ec521959..aad34aafc0e08 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -21,11 +21,15 @@ static double __initdata y = 3145727.0; * We should really only care about bugs here * anyway. Not features. */ -static void __init check_fpu(void) +void __init fpu__init_check_bugs(void) { u32 cr0_saved; s32 fdiv_bug; + /* kernel_fpu_begin/end() relies on patched alternative instructions. */ + if (!boot_cpu_has(X86_FEATURE_FPU)) + return; + /* We might have CR0::TS set already, clear it: */ cr0_saved = read_cr0(); write_cr0(cr0_saved & ~X86_CR0_TS); @@ -59,13 +63,3 @@ static void __init check_fpu(void) pr_warn("Hmm, FPU with FDIV bug\n"); } } - -void __init fpu__init_check_bugs(void) -{ - /* - * kernel_fpu_begin/end() in check_fpu() relies on the patched - * alternative instructions. - */ - if (boot_cpu_has(X86_FEATURE_FPU)) - check_fpu(); -} -- GitLab From 6aa6dbfced51dec6cde159c6167ad3dad6add823 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 5 Apr 2016 08:29:55 +0200 Subject: [PATCH 214/705] x86/fpu: Get rid of x87 math exception helpers ... and integrate their functionality into their single user fpu__exception_code(). No functionality change. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459837795-2588-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 44 +++++++++++--------------------------- 1 file changed, 13 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1551b28398a4a..97027545a72dc 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -506,33 +506,6 @@ void fpu__clear(struct fpu *fpu) * x87 math exception handling: */ -static inline unsigned short get_fpu_cwd(struct fpu *fpu) -{ - if (boot_cpu_has(X86_FEATURE_FXSR)) { - return fpu->state.fxsave.cwd; - } else { - return (unsigned short)fpu->state.fsave.cwd; - } -} - -static inline unsigned short get_fpu_swd(struct fpu *fpu) -{ - if (boot_cpu_has(X86_FEATURE_FXSR)) { - return fpu->state.fxsave.swd; - } else { - return (unsigned short)fpu->state.fsave.swd; - } -} - -static inline unsigned short get_fpu_mxcsr(struct fpu *fpu) -{ - if (boot_cpu_has(X86_FEATURE_XMM)) { - return fpu->state.fxsave.mxcsr; - } else { - return MXCSR_DEFAULT; - } -} - int fpu__exception_code(struct fpu *fpu, int trap_nr) { int err; @@ -547,10 +520,15 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) * so if this combination doesn't produce any single exception, * then we have a bad program that isn't synchronizing its FPU usage * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception + * fully reproduce the context of the exception. */ - cwd = get_fpu_cwd(fpu); - swd = get_fpu_swd(fpu); + if (boot_cpu_has(X86_FEATURE_FXSR)) { + cwd = fpu->state.fxsave.cwd; + swd = fpu->state.fxsave.swd; + } else { + cwd = (unsigned short)fpu->state.fsave.cwd; + swd = (unsigned short)fpu->state.fsave.swd; + } err = swd & ~cwd; } else { @@ -560,7 +538,11 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) * unmasked exception was caught we must mask the exception mask bits * at 0x1f80, and then use these to mask the exception bits at 0x3f. */ - unsigned short mxcsr = get_fpu_mxcsr(fpu); + unsigned short mxcsr = MXCSR_DEFAULT; + + if (boot_cpu_has(X86_FEATURE_XMM)) + mxcsr = fpu->state.fxsave.mxcsr; + err = ~(mxcsr >> 7) & mxcsr; } -- GitLab From 7bbcdb1ca4d2fd69094ee89c18601b396531ca9f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 2 Apr 2016 07:01:32 -0700 Subject: [PATCH 215/705] x86/head: Pass a real pt_regs and trapnr to early_fixup_exception() early_fixup_exception() is limited by the fact that it doesn't have a real struct pt_regs. Change both the 32-bit and 64-bit asm and the C code to pass and accept a real pt_regs. 
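For reference, the interface change in one place (both prototypes are taken verbatim from the uaccess.h hunk below); with a real pt_regs, a fixup can inspect or rewrite any saved register rather than only the instruction pointer:

/* Before: only the saved instruction pointer is visible to the fixup. */
extern int early_fixup_exception(unsigned long *ip);

/* After: the whole register frame plus the vector number are available. */
extern int early_fixup_exception(struct pt_regs *regs, int trapnr);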
Tested-by: Boris Ostrovsky Signed-off-by: Andy Lutomirski Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: KVM list Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: xen-devel Link: http://lkml.kernel.org/r/e3fb680fcfd5e23e38237e8328b64a25cc121d37.1459605520.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 2 +- arch/x86/kernel/head_32.S | 74 +++++++++++++++++++++++----------- arch/x86/kernel/head_64.S | 68 +++++++++++++++++-------------- arch/x86/mm/extable.c | 6 +-- 4 files changed, 92 insertions(+), 58 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a969ae607be83..b6fb311b7d757 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -110,7 +110,7 @@ struct exception_table_entry { extern int fixup_exception(struct pt_regs *regs, int trapnr); extern bool ex_has_fault_handler(unsigned long ip); -extern int early_fixup_exception(unsigned long *ip); +extern int early_fixup_exception(struct pt_regs *regs, int trapnr); /* * These are the main single-value transfer routines. They automatically diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 54cdbd2003fe0..0904536cd45c6 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -568,29 +568,64 @@ early_idt_handler_common: je hlt_loop incl %ss:early_recursion_flag - push %eax # 16(%esp) - push %ecx # 12(%esp) - push %edx # 8(%esp) - push %ds # 4(%esp) - push %es # 0(%esp) - movl $(__KERNEL_DS),%eax - movl %eax,%ds - movl %eax,%es + /* The vector number is in pt_regs->gs */ - cmpl $(__KERNEL_CS),32(%esp) + cld + pushl %fs /* pt_regs->fs */ + movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ + pushl %es /* pt_regs->es */ + movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ + pushl %ds /* pt_regs->ds */ + movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ + pushl %eax /* pt_regs->ax */ + pushl %ebp /* pt_regs->bp */ + pushl %edi /* pt_regs->di */ + pushl %esi /* pt_regs->si */ + pushl %edx /* pt_regs->dx */ + pushl %ecx /* pt_regs->cx */ + pushl %ebx /* pt_regs->bx */ + + /* Fix up DS and ES */ + movl $(__KERNEL_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + + /* Load the vector number into EDX */ + movl PT_GS(%esp), %edx + + /* Load GS into pt_regs->gs and clear high bits */ + movw %gs, PT_GS(%esp) + movw $0, PT_GS+2(%esp) + + cmpl $(__KERNEL_CS),PT_CS(%esp) jne 10f - leal 28(%esp),%eax # Pointer to %eip - call early_fixup_exception - andl %eax,%eax - jnz ex_entry /* found an exception entry */ + movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */ + call early_fixup_exception + andl %eax,%eax + jz 10f /* Exception wasn't fixed up */ + + popl %ebx /* pt_regs->bx */ + popl %ecx /* pt_regs->cx */ + popl %edx /* pt_regs->dx */ + popl %esi /* pt_regs->si */ + popl %edi /* pt_regs->di */ + popl %ebp /* pt_regs->bp */ + popl %eax /* pt_regs->ax */ + popl %ds /* pt_regs->ds */ + popl %es /* pt_regs->es */ + popl %fs /* pt_regs->fs */ + popl %gs /* pt_regs->gs */ + decl %ss:early_recursion_flag + addl $4, %esp /* pop pt_regs->orig_ax */ + iret 10: #ifdef CONFIG_PRINTK xorl %eax,%eax - movw %ax,2(%esp) /* clean up the segment values on some cpus */ - movw %ax,6(%esp) - movw %ax,34(%esp) + movw %ax,PT_FS+2(%esp) /* clean up the segment values on some cpus */ + movw %ax,PT_DS+2(%esp) + movw %ax,PT_ES+2(%esp) leal 40(%esp),%eax pushl %eax /* %esp before the exception */ pushl %ebx @@ -608,13 +643,6 @@ hlt_loop: hlt 
jmp hlt_loop -ex_entry: - pop %es - pop %ds - pop %edx - pop %ecx - pop %eax - decl %ss:early_recursion_flag .Lis_nmi: addl $8,%esp /* drop vector number and error code */ iret diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 22fbf9df61bb4..9e8636d2ceddb 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -20,6 +20,7 @@ #include #include #include +#include "../entry/calling.h" #ifdef CONFIG_PARAVIRT #include @@ -357,39 +358,52 @@ early_idt_handler_common: jz 1f incl early_recursion_flag(%rip) - pushq %rax # 64(%rsp) - pushq %rcx # 56(%rsp) - pushq %rdx # 48(%rsp) - pushq %rsi # 40(%rsp) - pushq %rdi # 32(%rsp) - pushq %r8 # 24(%rsp) - pushq %r9 # 16(%rsp) - pushq %r10 # 8(%rsp) - pushq %r11 # 0(%rsp) - - cmpl $__KERNEL_CS,96(%rsp) + /* The vector number is currently in the pt_regs->di slot. */ + pushq %rsi /* pt_regs->si */ + movq 8(%rsp), %rsi /* RSI = vector number */ + movq %rdi, 8(%rsp) /* pt_regs->di = RDI */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq %rax /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ + pushq %rbx /* pt_regs->bx */ + pushq %rbp /* pt_regs->bp */ + pushq %r12 /* pt_regs->r12 */ + pushq %r13 /* pt_regs->r13 */ + pushq %r14 /* pt_regs->r14 */ + pushq %r15 /* pt_regs->r15 */ + + cmpl $__KERNEL_CS,CS(%rsp) jne 11f - cmpl $14,72(%rsp) # Page fault? + cmpq $14,%rsi /* Page fault? */ jnz 10f - GET_CR2_INTO(%rdi) # can clobber any volatile register if pv + GET_CR2_INTO(%rdi) /* Can clobber any volatile register if pv */ call early_make_pgtable andl %eax,%eax - jz 20f # All good + jz 20f /* All good */ 10: - leaq 88(%rsp),%rdi # Pointer to %rip + movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */ call early_fixup_exception andl %eax,%eax jnz 20f # Found an exception entry 11: #ifdef CONFIG_EARLY_PRINTK - GET_CR2_INTO(%r9) # can clobber any volatile register if pv - movl 80(%rsp),%r8d # error code - movl 72(%rsp),%esi # vector number - movl 96(%rsp),%edx # %cs - movq 88(%rsp),%rcx # %rip + /* + * On paravirt kernels, GET_CR2_INTO clobbers callee-clobbered regs. + * We only care about RSI, so we need to save it. 
+ */ + movq %rsi,%rbx /* Save vector number */ + GET_CR2_INTO(%r9) + movq ORIG_RAX(%rsp),%r8 /* error code */ + movq %rbx,%rsi /* vector number */ + movq CS(%rsp),%rdx + movq RIP(%rsp),%rcx xorl %eax,%eax leaq early_idt_msg(%rip),%rdi call early_printk @@ -398,24 +412,16 @@ early_idt_handler_common: call dump_stack #ifdef CONFIG_KALLSYMS leaq early_idt_ripmsg(%rip),%rdi - movq 40(%rsp),%rsi # %rip again + movq RIP(%rsp),%rsi # %rip again call __print_symbol #endif #endif /* EARLY_PRINTK */ 1: hlt jmp 1b -20: # Exception table entry found or page table generated - popq %r11 - popq %r10 - popq %r9 - popq %r8 - popq %rdi - popq %rsi - popq %rdx - popq %rcx - popq %rax +20: /* Exception table entry found or page table generated */ decl early_recursion_flag(%rip) + jmp restore_regs_and_iret .Lis_nmi: addq $16,%rsp # drop vector number and error code INTERRUPT_RETURN diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 82447b3fba380..1366e067a796f 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -83,13 +83,13 @@ int fixup_exception(struct pt_regs *regs, int trapnr) } /* Restricted version used during very early boot */ -int __init early_fixup_exception(unsigned long *ip) +int __init early_fixup_exception(struct pt_regs *regs, int trapnr) { const struct exception_table_entry *e; unsigned long new_ip; ex_handler_t handler; - e = search_exception_tables(*ip); + e = search_exception_tables(regs->ip); if (!e) return 0; @@ -100,6 +100,6 @@ int __init early_fixup_exception(unsigned long *ip) if (handler != ex_handler_default) return 0; - *ip = new_ip; + regs->ip = new_ip; return 1; } -- GitLab From 0d0efc07f3df677d7622bb760f8e2920b5e33f42 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 2 Apr 2016 07:01:33 -0700 Subject: [PATCH 216/705] x86/head: Move the early NMI fixup into C C is nicer than asm. 
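Concretely, the early assembly check (cmpl $2,(%esp); je .Lis_nmi on 32-bit, and its 64-bit twin) becomes an ordinary C comparison at the top of early_fixup_exception(), as in the hunk below:

/* Ignore early NMIs: vector 2 is X86_TRAP_NMI. */
if (trapnr == X86_TRAP_NMI)
	return 1;

(The function still returns int at this point; a later patch in the series makes it void.)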
Tested-by: Boris Ostrovsky Signed-off-by: Andy Lutomirski Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: KVM list Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: xen-devel Link: http://lkml.kernel.org/r/dd068269f8d59fe44e9e43a50d0efd67da65c2b5.1459605520.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 7 ------- arch/x86/kernel/head_64.S | 6 ------ arch/x86/mm/extable.c | 5 +++++ 3 files changed, 5 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 0904536cd45c6..184291c72c22d 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -561,9 +561,6 @@ early_idt_handler_common: */ cld - cmpl $2,(%esp) # X86_TRAP_NMI - je .Lis_nmi # Ignore NMI - cmpl $2,%ss:early_recursion_flag je hlt_loop incl %ss:early_recursion_flag @@ -642,10 +639,6 @@ early_idt_handler_common: hlt_loop: hlt jmp hlt_loop - -.Lis_nmi: - addl $8,%esp /* drop vector number and error code */ - iret ENDPROC(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 9e8636d2ceddb..230843781dd43 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -351,9 +351,6 @@ early_idt_handler_common: */ cld - cmpl $2,(%rsp) # X86_TRAP_NMI - je .Lis_nmi # Ignore NMI - cmpl $2,early_recursion_flag(%rip) jz 1f incl early_recursion_flag(%rip) @@ -422,9 +419,6 @@ early_idt_handler_common: 20: /* Exception table entry found or page table generated */ decl early_recursion_flag(%rip) jmp restore_regs_and_iret -.Lis_nmi: - addq $16,%rsp # drop vector number and error code - INTERRUPT_RETURN ENDPROC(early_idt_handler_common) __INITDATA diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 1366e067a796f..4be041910c2f6 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,5 +1,6 @@ #include #include +#include typedef bool (*ex_handler_t)(const struct exception_table_entry *, struct pt_regs *, int); @@ -89,6 +90,10 @@ int __init early_fixup_exception(struct pt_regs *regs, int trapnr) unsigned long new_ip; ex_handler_t handler; + /* Ignore early NMIs. */ + if (trapnr == X86_TRAP_NMI) + return 1; + e = search_exception_tables(regs->ip); if (!e) return 0; -- GitLab From 0e861fbb5bda79b871341ef2a9a8059765cbe8a4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 2 Apr 2016 07:01:34 -0700 Subject: [PATCH 217/705] x86/head: Move early exception panic code into early_fixup_exception() This removes a bunch of assembly and adds some C code instead. It changes the actual printouts on both 32-bit and 64-bit kernels, but they still seem okay. 
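The resulting control flow, condensed here into a sketch from the extable.c hunk further below (the early_printk() format string is abbreviated; everything else mirrors the patch):

void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
{
	if (trapnr == X86_TRAP_NMI)	/* ignore early NMIs */
		return;
	if (early_recursion_flag > 2)	/* recursing: don't even try to print */
		goto halt_loop;
	if (regs->cs != __KERNEL_CS)	/* only kernel-mode faults are fixable */
		goto fail;
	/* ... exception-table lookup as before; on a miss, goto fail ... */
	return;
fail:
	early_printk("PANIC: early exception ...\n" /* trapnr, cs:ip, error code, cr2 */);
	show_regs(regs);
halt_loop:
	while (true)
		halt();
}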
Tested-by: Boris Ostrovsky Signed-off-by: Andy Lutomirski Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: KVM list Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: xen-devel Link: http://lkml.kernel.org/r/4085070316fc3ab29538d3fcfe282648d1d4ee2e.1459605520.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 2 +- arch/x86/kernel/head_32.S | 49 ++++------------------------------ arch/x86/kernel/head_64.S | 45 ++----------------------------- arch/x86/mm/extable.c | 29 ++++++++++++++++---- 4 files changed, 32 insertions(+), 93 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index b6fb311b7d757..d794fd1f582f1 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -110,7 +110,7 @@ struct exception_table_entry { extern int fixup_exception(struct pt_regs *regs, int trapnr); extern bool ex_has_fault_handler(unsigned long ip); -extern int early_fixup_exception(struct pt_regs *regs, int trapnr); +extern void early_fixup_exception(struct pt_regs *regs, int trapnr); /* * These are the main single-value transfer routines. They automatically diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 184291c72c22d..6770865fde6b6 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -561,8 +561,6 @@ early_idt_handler_common: */ cld - cmpl $2,%ss:early_recursion_flag - je hlt_loop incl %ss:early_recursion_flag /* The vector number is in pt_regs->gs */ @@ -594,13 +592,8 @@ early_idt_handler_common: movw %gs, PT_GS(%esp) movw $0, PT_GS+2(%esp) - cmpl $(__KERNEL_CS),PT_CS(%esp) - jne 10f - movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */ call early_fixup_exception - andl %eax,%eax - jz 10f /* Exception wasn't fixed up */ popl %ebx /* pt_regs->bx */ popl %ecx /* pt_regs->cx */ @@ -616,29 +609,6 @@ early_idt_handler_common: decl %ss:early_recursion_flag addl $4, %esp /* pop pt_regs->orig_ax */ iret - -10: -#ifdef CONFIG_PRINTK - xorl %eax,%eax - movw %ax,PT_FS+2(%esp) /* clean up the segment values on some cpus */ - movw %ax,PT_DS+2(%esp) - movw %ax,PT_ES+2(%esp) - leal 40(%esp),%eax - pushl %eax /* %esp before the exception */ - pushl %ebx - pushl %ebp - pushl %esi - pushl %edi - movl %cr2,%eax - pushl %eax - pushl (20+6*4)(%esp) /* trapno */ - pushl $fault_msg - call printk -#endif - call dump_stack -hlt_loop: - hlt - jmp hlt_loop ENDPROC(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ @@ -674,10 +644,14 @@ ignore_int: popl %eax #endif iret + +hlt_loop: + hlt + jmp hlt_loop ENDPROC(ignore_int) __INITDATA .align 4 -early_recursion_flag: +GLOBAL(early_recursion_flag) .long 0 __REFDATA @@ -742,19 +716,6 @@ __INITRODATA int_msg: .asciz "Unknown interrupt or fault at: %p %p %p\n" -fault_msg: -/* fault info: */ - .ascii "BUG: Int %d: CR2 %p\n" -/* regs pushed in early_idt_handler: */ - .ascii " EDI %p ESI %p EBP %p EBX %p\n" - .ascii " ESP %p ES %p DS %p\n" - .ascii " EDX %p ECX %p EAX %p\n" -/* fault frame: */ - .ascii " vec %p err %p EIP %p CS %p flg %p\n" - .ascii "Stack: %p %p %p %p %p %p %p %p\n" - .ascii " %p %p %p %p %p %p %p %p\n" - .asciz " %p %p %p %p %p %p %p %p\n" - #include "../../x86/xen/xen-head.S" /* diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 230843781dd43..3de91a7e6c99c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -351,8 +351,6 @@ early_idt_handler_common: */ cld - cmpl $2,early_recursion_flag(%rip) - jz 1f incl 
early_recursion_flag(%rip) /* The vector number is currently in the pt_regs->di slot. */ @@ -373,9 +371,6 @@ early_idt_handler_common: pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ - cmpl $__KERNEL_CS,CS(%rsp) - jne 11f - cmpq $14,%rsi /* Page fault? */ jnz 10f GET_CR2_INTO(%rdi) /* Can clobber any volatile register if pv */ @@ -386,37 +381,8 @@ early_idt_handler_common: 10: movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */ call early_fixup_exception - andl %eax,%eax - jnz 20f # Found an exception entry - -11: -#ifdef CONFIG_EARLY_PRINTK - /* - * On paravirt kernels, GET_CR2_INTO clobbers callee-clobbered regs. - * We only care about RSI, so we need to save it. - */ - movq %rsi,%rbx /* Save vector number */ - GET_CR2_INTO(%r9) - movq ORIG_RAX(%rsp),%r8 /* error code */ - movq %rbx,%rsi /* vector number */ - movq CS(%rsp),%rdx - movq RIP(%rsp),%rcx - xorl %eax,%eax - leaq early_idt_msg(%rip),%rdi - call early_printk - cmpl $2,early_recursion_flag(%rip) - jz 1f - call dump_stack -#ifdef CONFIG_KALLSYMS - leaq early_idt_ripmsg(%rip),%rdi - movq RIP(%rsp),%rsi # %rip again - call __print_symbol -#endif -#endif /* EARLY_PRINTK */ -1: hlt - jmp 1b -20: /* Exception table entry found or page table generated */ +20: decl early_recursion_flag(%rip) jmp restore_regs_and_iret ENDPROC(early_idt_handler_common) @@ -424,16 +390,9 @@ ENDPROC(early_idt_handler_common) __INITDATA .balign 4 -early_recursion_flag: +GLOBAL(early_recursion_flag) .long 0 -#ifdef CONFIG_EARLY_PRINTK -early_idt_msg: - .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n" -early_idt_ripmsg: - .asciz "RIP %s\n" -#endif /* CONFIG_EARLY_PRINTK */ - #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ GLOBAL(name) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 4be041910c2f6..da442f37ca8b5 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -83,8 +83,10 @@ int fixup_exception(struct pt_regs *regs, int trapnr) return handler(e, regs, trapnr); } +extern unsigned int early_recursion_flag; + /* Restricted version used during very early boot */ -int __init early_fixup_exception(struct pt_regs *regs, int trapnr) +void __init early_fixup_exception(struct pt_regs *regs, int trapnr) { const struct exception_table_entry *e; unsigned long new_ip; @@ -92,19 +94,36 @@ int __init early_fixup_exception(struct pt_regs *regs, int trapnr) /* Ignore early NMIs. */ if (trapnr == X86_TRAP_NMI) - return 1; + return; + + if (early_recursion_flag > 2) + goto halt_loop; + + if (regs->cs != __KERNEL_CS) + goto fail; e = search_exception_tables(regs->ip); if (!e) - return 0; + goto fail; new_ip = ex_fixup_addr(e); handler = ex_fixup_handler(e); /* special handling not supported during early boot */ if (handler != ex_handler_default) - return 0; + goto fail; regs->ip = new_ip; - return 1; + return; + +fail: + early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n", + (unsigned)trapnr, (unsigned long)regs->cs, regs->ip, + regs->orig_ax, read_cr2()); + + show_regs(regs); + +halt_loop: + while (true) + halt(); } -- GitLab From ae7ef45e12354a1e2f6013b46df0c9f5bbb6ffbe Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 2 Apr 2016 07:01:35 -0700 Subject: [PATCH 218/705] x86/traps: Enable all exception handler callbacks early Now that early_fixup_exception() has pt_regs, we can just call fixup_exception() from it. This will make fancy exception handlers work early. 
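To make "fancy" concrete, here is a minimal sketch of a custom handler that can now also run for early faults. The ex_handler_t signature and the ex_fixup_addr() helper are the real ones from extable.c; the handler name and its policy of returning -EFAULT are hypothetical:

/* Hypothetical handler (would live in extable.c next to the others):
 * report failure in the saved AX register and resume at the fixup site. */
static bool ex_handler_example(const struct exception_table_entry *fixup,
			       struct pt_regs *regs, int trapnr)
{
	regs->ax = -EFAULT;			/* caller sees an error code */
	regs->ip = ex_fixup_addr(fixup);	/* skip the faulting instruction */
	return true;				/* exception handled */
}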
Tested-by: Boris Ostrovsky Signed-off-by: Andy Lutomirski Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: KVM list Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: xen-devel Link: http://lkml.kernel.org/r/20fc047d926150cb08cb9b9f2923519b07ec1a15.1459605520.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/extable.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index da442f37ca8b5..061a237583547 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -88,10 +88,6 @@ extern unsigned int early_recursion_flag; /* Restricted version used during very early boot */ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) { - const struct exception_table_entry *e; - unsigned long new_ip; - ex_handler_t handler; - /* Ignore early NMIs. */ if (trapnr == X86_TRAP_NMI) return; @@ -102,19 +98,8 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) if (regs->cs != __KERNEL_CS) goto fail; - e = search_exception_tables(regs->ip); - if (!e) - goto fail; - - new_ip = ex_fixup_addr(e); - handler = ex_fixup_handler(e); - - /* special handling not supported during early boot */ - if (handler != ex_handler_default) - goto fail; - - regs->ip = new_ip; - return; + if (fixup_exception(regs, trapnr)) + return; fail: early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n", -- GitLab From c2ee03b2a94d7ba692cf6206bbe069d5bfcc20ed Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 2 Apr 2016 07:01:36 -0700 Subject: [PATCH 219/705] x86/paravirt: Add _safe to the read_msr() and write_msr() PV callbacks These callbacks match the _safe variants, so name them accordingly. This will make room for unsafe PV callbacks. Tested-by: Boris Ostrovsky Signed-off-by: Andy Lutomirski Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: KVM list Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: xen-devel Link: http://lkml.kernel.org/r/9ee3fb6a196a514c93325bdfa15594beecf04876.1459605520.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/paravirt.h | 33 ++++++++++++++------------- arch/x86/include/asm/paravirt_types.h | 8 +++---- arch/x86/kernel/paravirt.c | 4 ++-- arch/x86/xen/enlighten.c | 4 ++-- 4 files changed, 25 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 601f1b8f9961a..81ef2d5c2a248 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -130,34 +130,35 @@ static inline void wbinvd(void) #define get_kernel_rpl() (pv_info.kernel_rpl) -static inline u64 paravirt_read_msr(unsigned msr, int *err) +static inline u64 paravirt_read_msr_safe(unsigned msr, int *err) { - return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err); + return PVOP_CALL2(u64, pv_cpu_ops.read_msr_safe, msr, err); } -static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) +static inline int paravirt_write_msr_safe(unsigned msr, + unsigned low, unsigned high) { - return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high); + return PVOP_CALL3(int, pv_cpu_ops.write_msr_safe, msr, low, high); } /* These should all do BUG_ON(_err), but our headers are too tangled.
*/ #define rdmsr(msr, val1, val2) \ do { \ int _err; \ - u64 _l = paravirt_read_msr(msr, &_err); \ + u64 _l = paravirt_read_msr_safe(msr, &_err); \ val1 = (u32)_l; \ val2 = _l >> 32; \ } while (0) #define wrmsr(msr, val1, val2) \ do { \ - paravirt_write_msr(msr, val1, val2); \ + paravirt_write_msr_safe(msr, val1, val2); \ } while (0) #define rdmsrl(msr, val) \ do { \ int _err; \ - val = paravirt_read_msr(msr, &_err); \ + val = paravirt_read_msr_safe(msr, &_err); \ } while (0) static inline void wrmsrl(unsigned msr, u64 val) @@ -165,23 +166,23 @@ static inline void wrmsrl(unsigned msr, u64 val) wrmsr(msr, (u32)val, (u32)(val>>32)); } -#define wrmsr_safe(msr, a, b) paravirt_write_msr(msr, a, b) +#define wrmsr_safe(msr, a, b) paravirt_write_msr_safe(msr, a, b) /* rdmsr with exception handling */ -#define rdmsr_safe(msr, a, b) \ -({ \ - int _err; \ - u64 _l = paravirt_read_msr(msr, &_err); \ - (*a) = (u32)_l; \ - (*b) = _l >> 32; \ - _err; \ +#define rdmsr_safe(msr, a, b) \ +({ \ + int _err; \ + u64 _l = paravirt_read_msr_safe(msr, &_err); \ + (*a) = (u32)_l; \ + (*b) = _l >> 32; \ + _err; \ }) static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) { int err; - *p = paravirt_read_msr(msr, &err); + *p = paravirt_read_msr_safe(msr, &err); return err; } diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index e8c2326478c8f..09c9e1dd81ce9 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -155,10 +155,10 @@ struct pv_cpu_ops { void (*cpuid)(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); - /* MSR, PMC and TSR operations. - err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ - u64 (*read_msr)(unsigned int msr, int *err); - int (*write_msr)(unsigned int msr, unsigned low, unsigned high); + /* MSR operations. + err = 0/-EIO. wrmsr returns 0/-EIO. */ + u64 (*read_msr_safe)(unsigned int msr, int *err); + int (*write_msr_safe)(unsigned int msr, unsigned low, unsigned high); u64 (*read_pmc)(int counter); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f08ac28b8136d..8aad95478ae54 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -339,8 +339,8 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .write_cr8 = native_write_cr8, #endif .wbinvd = native_wbinvd, - .read_msr = native_read_msr_safe, - .write_msr = native_write_msr_safe, + .read_msr_safe = native_read_msr_safe, + .write_msr_safe = native_write_msr_safe, .read_pmc = native_read_pmc, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9b8f1eacc1104..13f756fdcb334 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1222,8 +1222,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .wbinvd = native_wbinvd, - .read_msr = xen_read_msr_safe, - .write_msr = xen_write_msr_safe, + .read_msr_safe = xen_read_msr_safe, + .write_msr_safe = xen_write_msr_safe, .read_pmc = xen_read_pmc, -- GitLab From fbd704374d111bed16a19261176fa30e2379c87c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 2 Apr 2016 07:01:37 -0700 Subject: [PATCH 220/705] x86/msr: Carry on after a non-"safe" MSR access fails This demotes an OOPS and likely panic due to a failed non-"safe" MSR access to a WARN_ONCE() and, for RDMSR, a return value of zero. To be clear, this type of failure should *not* happen. 
This patch exists to minimize the chance of nasty undebuggable failures happening when a CONFIG_PARAVIRT=y bug in the non-"safe" MSR helpers gets fixed. Tested-by: Boris Ostrovsky Signed-off-by: Andy Lutomirski Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: KVM list Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: xen-devel Link: http://lkml.kernel.org/r/26567b216aae70e795938f4b567eace5a0eb90ba.1459605520.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 10 ++++++++-- arch/x86/mm/extable.c | 27 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 7a79ee2778b3b..25f169c6eb953 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -84,7 +84,10 @@ static inline unsigned long long native_read_msr(unsigned int msr) { DECLARE_ARGS(val, low, high); - asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr)); + asm volatile("1: rdmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe) + : EAX_EDX_RET(val, low, high) : "c" (msr)); if (msr_tracepoint_active(__tracepoint_read_msr)) do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), 0); return EAX_EDX_VAL(val, low, high); @@ -111,7 +114,10 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, static inline void native_write_msr(unsigned int msr, unsigned low, unsigned high) { - asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory"); + asm volatile("1: wrmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe) + : : "c" (msr), "a"(low), "d" (high) : "memory"); if (msr_tracepoint_active(__tracepoint_read_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), 0); } diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 061a237583547..fd9eb98c4f581 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -43,6 +43,33 @@ bool ex_handler_ext(const struct exception_table_entry *fixup, } EXPORT_SYMBOL(ex_handler_ext); +bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + WARN_ONCE(1, "unchecked MSR access error: RDMSR from 0x%x\n", + (unsigned int)regs->cx); + + /* Pretend that the read succeeded and returned 0. */ + regs->ip = ex_fixup_addr(fixup); + regs->ax = 0; + regs->dx = 0; + return true; +} +EXPORT_SYMBOL(ex_handler_rdmsr_unsafe); + +bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + WARN_ONCE(1, "unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x)\n", + (unsigned int)regs->cx, + (unsigned int)regs->dx, (unsigned int)regs->ax); + + /* Pretend that the write succeeded. */ + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_wrmsr_unsafe); + bool ex_has_fault_handler(unsigned long ip) { const struct exception_table_entry *e; -- GitLab From dd2f4a004b016bbfb64f1de49cb45e66232e40a6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 2 Apr 2016 07:01:38 -0700 Subject: [PATCH 221/705] x86/paravirt: Add paravirt_{read,write}_msr() This adds paravirt callbacks for unsafe MSR access. On native, they call native_{read,write}_msr(). On Xen, they use xen_{read,write}_msr_safe(). Nothing uses them yet for ease of bisection. The next patch will use them in rdmsrl(), wrmsrl(), etc. I intentionally didn't make them warn on #GP on Xen. I think that should be done separately by the Xen maintainers. 
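Once the later patches in this series wire these unsafe callbacks into rdmsr()/wrmsr(), a failed access to a non-existent MSR degrades into a one-time warning instead of an OOPS. A hypothetical sketch of the resulting behavior (msr_demo() is not a real kernel function, and 0x12345678 is an arbitrary MSR number assumed to be invalid):

	#include <asm/msr.h>

	static void msr_demo(void)
	{
		u64 val;

		/* hits ex_handler_rdmsr_unsafe(): WARNs once, val reads as 0 */
		rdmsrl(0x12345678, val);

		/* hits ex_handler_wrmsr_unsafe(): WARNs once, write is dropped */
		wrmsrl(0x12345678, 1);
	}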
Tested-by: Boris Ostrovsky Signed-off-by: Andy Lutomirski Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: KVM list Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: xen-devel Link: http://lkml.kernel.org/r/880eebc5dcd2ad9f310d41345f82061ea500e9fa.1459605520.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 5 +++-- arch/x86/include/asm/paravirt.h | 11 +++++++++++ arch/x86/include/asm/paravirt_types.h | 10 ++++++++-- arch/x86/kernel/paravirt.c | 2 ++ arch/x86/xen/enlighten.c | 23 +++++++++++++++++++++++ 5 files changed, 47 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 25f169c6eb953..00050c034a138 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -111,8 +111,9 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, return EAX_EDX_VAL(val, low, high); } -static inline void native_write_msr(unsigned int msr, - unsigned low, unsigned high) +/* Can be uninlined because referenced by paravirt */ +notrace static inline void native_write_msr(unsigned int msr, + unsigned low, unsigned high) { asm volatile("1: wrmsr\n" "2:\n" diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 81ef2d5c2a248..97839fa8b8aae 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -130,6 +130,17 @@ static inline void wbinvd(void) #define get_kernel_rpl() (pv_info.kernel_rpl) +static inline u64 paravirt_read_msr(unsigned msr) +{ + return PVOP_CALL1(u64, pv_cpu_ops.read_msr, msr); +} + +static inline void paravirt_write_msr(unsigned msr, + unsigned low, unsigned high) +{ + return PVOP_VCALL3(pv_cpu_ops.write_msr, msr, low, high); +} + static inline u64 paravirt_read_msr_safe(unsigned msr, int *err) { return PVOP_CALL2(u64, pv_cpu_ops.read_msr_safe, msr, err); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 09c9e1dd81ce9..b4a23eafa1b95 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -155,8 +155,14 @@ struct pv_cpu_ops { void (*cpuid)(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); - /* MSR operations. - err = 0/-EIO. wrmsr returns 0/-EIO. */ + /* Unsafe MSR operations. These will warn or panic on failure. */ + u64 (*read_msr)(unsigned int msr); + void (*write_msr)(unsigned int msr, unsigned low, unsigned high); + + /* + * Safe MSR operations. + * read sets err to 0 or -EIO. write returns 0 or -EIO. 
+ */ u64 (*read_msr_safe)(unsigned int msr, int *err); int (*write_msr_safe)(unsigned int msr, unsigned low, unsigned high); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 8aad95478ae54..f9583917c7c4f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -339,6 +339,8 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .write_cr8 = native_write_cr8, #endif .wbinvd = native_wbinvd, + .read_msr = native_read_msr, + .write_msr = native_write_msr, .read_msr_safe = native_read_msr_safe, .write_msr_safe = native_write_msr_safe, .read_pmc = native_read_pmc, diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 13f756fdcb334..6ab672233ac98 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1092,6 +1092,26 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) return ret; } +static u64 xen_read_msr(unsigned int msr) +{ + /* + * This will silently swallow a #GP from RDMSR. It may be worth + * changing that. + */ + int err; + + return xen_read_msr_safe(msr, &err); +} + +static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) +{ + /* + * This will silently swallow a #GP from WRMSR. It may be worth + * changing that. + */ + xen_write_msr_safe(msr, low, high); +} + void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { @@ -1222,6 +1242,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .wbinvd = native_wbinvd, + .read_msr = xen_read_msr, + .write_msr = xen_write_msr, + .read_msr_safe = xen_read_msr_safe, .write_msr_safe = xen_write_msr_safe, -- GitLab From 4985ce15a397e9b6541548efe3b9ffac2dda9127 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 2 Apr 2016 07:01:39 -0700 Subject: [PATCH 222/705] x86/paravirt: Make "unsafe" MSR accesses unsafe even if PARAVIRT=y Enabling CONFIG_PARAVIRT had an unintended side effect: rdmsr() turned into rdmsr_safe() and wrmsr() turned into wrmsr_safe(), even on bare metal. Undo that by using the new unsafe paravirt MSR callbacks. Tested-by: Boris Ostrovsky Signed-off-by: Andy Lutomirski Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: KVM list Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: xen-devel Link: http://lkml.kernel.org/r/414fabd6d3527703077c6c2a797223d0a9c3b081.1459605520.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/paravirt.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 97839fa8b8aae..3c731413f1dee 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -152,24 +152,21 @@ static inline int paravirt_write_msr_safe(unsigned msr, return PVOP_CALL3(int, pv_cpu_ops.write_msr_safe, msr, low, high); } -/* These should all do BUG_ON(_err), but our headers are too tangled. 
*/ #define rdmsr(msr, val1, val2) \ do { \ - int _err; \ - u64 _l = paravirt_read_msr_safe(msr, &_err); \ + u64 _l = paravirt_read_msr(msr); \ val1 = (u32)_l; \ val2 = _l >> 32; \ } while (0) #define wrmsr(msr, val1, val2) \ do { \ - paravirt_write_msr_safe(msr, val1, val2); \ + paravirt_write_msr(msr, val1, val2); \ } while (0) #define rdmsrl(msr, val) \ do { \ - int _err; \ - val = paravirt_read_msr_safe(msr, &_err); \ + val = paravirt_read_msr(msr); \ } while (0) static inline void wrmsrl(unsigned msr, u64 val) -- GitLab From b828b79fcced0e66492590707649dbfaea6435e6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 2 Apr 2016 07:01:40 -0700 Subject: [PATCH 223/705] x86/msr: Set the return value to zero when native_rdmsr_safe() fails This will cause unchecked native_rdmsr_safe() failures to return deterministic results. Tested-by: Boris Ostrovsky Signed-off-by: Andy Lutomirski Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: KVM list Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: xen-devel Link: http://lkml.kernel.org/r/515fb611449a755312a476cfe11675906e7ddf6c.1459605520.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 00050c034a138..7dc1d8fef7fde 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -101,7 +101,10 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, asm volatile("2: rdmsr ; xor %[err],%[err]\n" "1:\n\t" ".section .fixup,\"ax\"\n\t" - "3: mov %[fault],%[err] ; jmp 1b\n\t" + "3: mov %[fault],%[err]\n\t" + "xorl %%eax, %%eax\n\t" + "xorl %%edx, %%edx\n\t" + "jmp 1b\n\t" ".previous\n\t" _ASM_EXTABLE(2b, 3b) : [err] "=r" (*err), EAX_EDX_RET(val, low, high) -- GitLab From 60a0e2039e3df6c0a2b896bd78af36ff36fb629c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Apr 2016 08:46:22 -0700 Subject: [PATCH 224/705] x86/extable: Add a comment about early exception handlers Borislav asked for a comment explaining why all exception handlers are allowed early. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arjan van de Ven Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: KVM list Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: xen-devel Link: http://lkml.kernel.org/r/5f1dcd6919f4a5923959a8065cb2c04d9dac1412.1459784772.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/extable.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index fd9eb98c4f581..aaeda3ffaafef 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -125,6 +125,20 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) if (regs->cs != __KERNEL_CS) goto fail; + /* + * The full exception fixup machinery is available as soon as + * the early IDT is loaded. This means that it is the + * responsibility of extable users to either function correctly + * when handlers are invoked early or to simply avoid causing + * exceptions before they're ready to handle them. + * + * This is better than filtering which handlers can be used, + * because refusing to call a handler here is guaranteed to + * result in a hard-to-debug panic. + * + * Keep in mind that not all vectors actually get here. Early + * page faults, for example, are special.
+ */ if (fixup_exception(regs, trapnr)) return; -- GitLab From 47a541c3e19374ec9f5d3d96730a922e8480dda5 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 1 Apr 2016 17:51:54 -0700 Subject: [PATCH 225/705] x86/platform: Remove unused get_bios_ebda_length() function get_bios_ebda_length() uses min_t() without including linux/kernel.h. This may result in build errors with some configurations. Since the function is not used anywhere in the kernel, let's just drop it. Signed-off-by: Guenter Roeck Cc: Linus Torvalds Cc: Mike Waychison Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459558314-5625-1-git-send-email-linux@roeck-us.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/bios_ebda.h | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h index aa6a3170ab5ad..2b00c776f223a 100644 --- a/arch/x86/include/asm/bios_ebda.h +++ b/arch/x86/include/asm/bios_ebda.h @@ -17,27 +17,6 @@ static inline unsigned int get_bios_ebda(void) return address; /* 0 means none */ } -/* - * Return the sanitized length of the EBDA in bytes, if it exists. - */ -static inline unsigned int get_bios_ebda_length(void) -{ - unsigned int address; - unsigned int length; - - address = get_bios_ebda(); - if (!address) - return 0; - - /* EBDA length is byte 0 of the EBDA (stored in KiB) */ - length = *(unsigned char *)phys_to_virt(address); - length <<= 10; - - /* Trim the length if it extends beyond 640KiB */ - length = min_t(unsigned int, (640 * 1024) - address, length); - return length; -} - void reserve_ebda_region(void); #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION -- GitLab From 91ed140d6c1e168b11bbbddac4f6066f40a0c6b5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Mar 2016 16:21:02 +0200 Subject: [PATCH 226/705] x86/asm: Make sure verify_cpu() has a good stack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 04633df0c43d ("x86/cpu: Call verify_cpu() after having entered long mode too") added the call to verify_cpu() for sanitizing CPU configuration. The latter uses the stack minimally and it can happen that we land in startup_64() directly from a 64-bit bootloader. Then we want to use our own, known good stack. Do that. APs don't need this as the trampoline sets up a stack for them. Reported-by: Tom Lendacky Signed-off-by: Borislav Petkov Cc: Brian Gerst Cc: Linus Torvalds Cc: Mika Penttilä Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1459434062-31055-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 8 ++++++++ include/asm-generic/vmlinux.lds.h | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3de91a7e6c99c..5df831ef1442f 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -65,6 +65,14 @@ startup_64: * tables and then reload them. */ + /* + * Setup stack for verify_cpu(). "-8" because stack_start is defined + * this way, see below. Our best guess is a NULL ptr for stack + * termination heuristics and we don't want to break anything which + * might depend on it (kgdb, ...). 
+ */ + leaq (__end_init_task - 8)(%rip), %rsp + /* Sanitize CPU configuration */ call verify_cpu diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 339125bb4d2cf..6a67ab94b5533 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -245,7 +245,9 @@ #define INIT_TASK_DATA(align) \ . = ALIGN(align); \ - *(.data..init_task) + VMLINUX_SYMBOL(__start_init_task) = .; \ + *(.data..init_task) \ + VMLINUX_SYMBOL(__end_init_task) = .; /* * Read only Data -- GitLab From 31d50c551e30923b86a1b5b420920dd1927fa63b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Apr 2016 16:02:08 +0200 Subject: [PATCH 227/705] perf/x86/amd/uncore: Do not register a task ctx for uncore PMUs The new sanity check introduced by: 26657848502b ("perf/core: Verify we have a single perf_hw_context PMU") ... triggered on the AMD uncore driver. Uncore PMUs are per node, they cannot have per-task counters. Fix it. Reported-by: Borislav Petkov Reported-by: Ingo Molnar Tested-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@redhat.com Cc: alexander.shishkin@linux.intel.com Cc: eranian@google.com Cc: jolsa@redhat.com Cc: linux-tip-commits@vger.kernel.org Cc: vincent.weaver@maine.edu Link: http://lkml.kernel.org/r/20160404140208.GA3448@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/events/amd/uncore.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 3db9569e658c8..98ac57381bf9b 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -263,6 +263,7 @@ static const struct attribute_group *amd_uncore_attr_groups[] = { }; static struct pmu amd_nb_pmu = { + .task_ctx_nr = perf_invalid_context, .attr_groups = amd_uncore_attr_groups, .name = "amd_nb", .event_init = amd_uncore_event_init, @@ -274,6 +275,7 @@ static struct pmu amd_nb_pmu = { }; static struct pmu amd_l2_pmu = { + .task_ctx_nr = perf_invalid_context, .attr_groups = amd_uncore_attr_groups, .name = "amd_l2", .event_init = amd_uncore_event_init, -- GitLab From c78b17e28cc2c2df74264afc408bdc6aaf3fbcc8 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 11 Apr 2016 16:38:33 +0200 Subject: [PATCH 228/705] sched/clock: Remove pointless test in cpu_clock/local_clock In case the HAVE_UNSTABLE_SCHED_CLOCK config is set, the cpu_clock() version checks if sched_clock_stable() is not set and calls sched_clock_cpu(), otherwise it calls sched_clock(). sched_clock_cpu() checks also if sched_clock_stable() is set and, if true, calls sched_clock(). sched_clock() will be called in sched_clock_cpu() if sched_clock_stable() is true. Remove the duplicate test by directly calling sched_clock_cpu() and let the static key act in this function instead. 
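The duplication being removed, shown schematically (a sketch of the pre-patch control flow reconstructed from the description above, not the exact source):

	u64 cpu_clock(int cpu)
	{
		if (!sched_clock_stable())
			return sched_clock_cpu(cpu);
		return sched_clock();
	}

	u64 sched_clock_cpu(int cpu)
	{
		if (sched_clock_stable())	/* same test repeated */
			return sched_clock();
		/* ... per-cpu clock path ... */
	}

After the patch, cpu_clock() simply returns sched_clock_cpu(cpu), so the stability check happens exactly once, behind the static key inside sched_clock_cpu().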
Signed-off-by: Daniel Lezcano Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1460385514-14700-1-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index fedb967a98419..30c4b202f0ba0 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -375,10 +375,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); */ u64 cpu_clock(int cpu) { - if (!sched_clock_stable()) - return sched_clock_cpu(cpu); - - return sched_clock(); + return sched_clock_cpu(cpu); } /* @@ -390,10 +387,7 @@ u64 cpu_clock(int cpu) */ u64 local_clock(void) { - if (!sched_clock_stable()) - return sched_clock_cpu(raw_smp_processor_id()); - - return sched_clock(); + return sched_clock_cpu(raw_smp_processor_id()); } #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ -- GitLab From 2c923e94cd9c6acff3b22f0ae29cfe65e2658b40 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 11 Apr 2016 16:38:34 +0200 Subject: [PATCH 229/705] sched/clock: Make local_clock()/cpu_clock() inline The local_clock/cpu_clock functions were changed to prevent a double identical test with sched_clock_cpu() when HAVE_UNSTABLE_SCHED_CLOCK is set. That resulted in one line functions. As these functions are in all the cases one line functions and in the hot path, it is useful to specify them as static inline in order to give a strong hint to the compiler. After verification, it appears the compiler does not inline them without this hint. Change those functions to static inline. sched_clock_cpu() is called via the inlined local_clock()/cpu_clock() functions from sched.h. So any module code including sched.h will reference sched_clock_cpu(). Thus it must be exported with the EXPORT_SYMBOL_GPL macro. Signed-off-by: Daniel Lezcano Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1460385514-14700-2-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 32 ++++++++++++++++++++++++++++++-- kernel/sched/clock.c | 42 +----------------------------------------- 2 files changed, 31 insertions(+), 43 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 52c4847b05e28..13c1c1d07270a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2303,8 +2303,6 @@ extern unsigned long long notrace sched_clock(void); /* * See the comment in kernel/sched/clock.c */ -extern u64 cpu_clock(int cpu); -extern u64 local_clock(void); extern u64 running_clock(void); extern u64 sched_clock_cpu(int cpu); @@ -2323,6 +2321,16 @@ static inline void sched_clock_idle_sleep_event(void) static inline void sched_clock_idle_wakeup_event(u64 delta_ns) { } + +static inline u64 cpu_clock(int cpu) +{ + return sched_clock(); +} + +static inline u64 local_clock(void) +{ + return sched_clock(); +} #else /* * Architectures can set this to 1 if they have specified @@ -2337,6 +2345,26 @@ extern void clear_sched_clock_stable(void); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); + +/* + * As outlined in clock.c, provides a fast, high resolution, nanosecond + * time source that is monotonic per cpu argument and has bounded drift + * between cpus. 
+ * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + */ +static inline u64 cpu_clock(int cpu) +{ + return sched_clock_cpu(cpu); +} + +static inline u64 local_clock(void) +{ + return sched_clock_cpu(raw_smp_processor_id()); +} #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 30c4b202f0ba0..e85a725e5c349 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -318,6 +318,7 @@ u64 sched_clock_cpu(int cpu) return clock; } +EXPORT_SYMBOL_GPL(sched_clock_cpu); void sched_clock_tick(void) { @@ -363,33 +364,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -/* - * As outlined at the top, provides a fast, high resolution, nanosecond - * time source that is monotonic per cpu argument and has bounded drift - * between cpus. - * - * ######################### BIG FAT WARNING ########################## - * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # - * # go backwards !! # - * #################################################################### - */ -u64 cpu_clock(int cpu) -{ - return sched_clock_cpu(cpu); -} - -/* - * Similar to cpu_clock() for the current cpu. Time will only be observed - * to be monotonic if care is taken to only compare timestampt taken on the - * same CPU. - * - * See cpu_clock(). - */ -u64 local_clock(void) -{ - return sched_clock_cpu(raw_smp_processor_id()); -} - #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -404,22 +378,8 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } - -u64 cpu_clock(int cpu) -{ - return sched_clock(); -} - -u64 local_clock(void) -{ - return sched_clock(); -} - #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ -EXPORT_SYMBOL_GPL(cpu_clock); -EXPORT_SYMBOL_GPL(local_clock); - /* * Running clock - returns the time that has elapsed while a guest has been * running. -- GitLab From bd92883051a0228cc34996b8e766111ba10c9aac Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 6 Apr 2016 21:59:50 +1000 Subject: [PATCH 230/705] sched/cpuacct: Check for NULL when using task_pt_regs() task_pt_regs() can return NULL for kernel threads, so add a check. This fixes an oops at boot on ppc64. 
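The fix below reorders the logic so that nothing is dereferenced before the NULL check; the idiom, as a sketch mirroring the hunk that follows:

	struct pt_regs *regs = task_pt_regs(tsk);	/* may be NULL for kthreads */
	int index = CPUACCT_USAGE_SYSTEM;		/* safe default */

	if (regs && user_mode(regs))
		index = CPUACCT_USAGE_USER;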
Reported-and-Tested-by: Srikar Dronamraju Tested-by: Zhao Lei Signed-off-by: Anton Blanchard Acked-by: Zhao Lei Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Stephen Rothwell Cc: Thomas Gleixner Cc: efault@gmx.de Cc: htejun@gmail.com Cc: linuxppc-dev@lists.ozlabs.org Cc: tj@kernel.org Cc: yangds.fnst@cn.fujitsu.com Link: http://lkml.kernel.org/r/20160406215950.04bc3f0b@kryten Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index df947e07aac1d..41f85c4d09387 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -316,12 +316,11 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; - int index; + int index = CPUACCT_USAGE_SYSTEM; + struct pt_regs *regs = task_pt_regs(tsk); - if (user_mode(task_pt_regs(tsk))) + if (regs && user_mode(regs)) index = CPUACCT_USAGE_USER; - else - index = CPUACCT_USAGE_SYSTEM; rcu_read_lock(); -- GitLab From 202ff9684a912c96e0f2fac65e34280a97ad3611 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 12 Apr 2016 10:11:07 -0300 Subject: [PATCH 231/705] perf trace: Support callchains for --event too We already were able to ask for callchains for a specific event: # trace -e nanosleep --call dwarf --event sched:sched_switch/call-graph=fp/ usleep 1 This would enable tracing just the "nanosleep" syscall, with callchains at syscall exit and would ask the kernel for frame pointer callchains to be enabled for the "sched:sched_switch" tracepoint event, its just that we were not resolving the callchain and printing it in 'perf trace', do it: # trace -e nanosleep --call dwarf --event sched:sched_switch/call-graph=fp/ usleep 1 0.425 ( 0.013 ms): usleep/6718 nanosleep(rqtp: 0x7ffcc1d16e20) ... 0.425 ( ): sched:sched_switch:usleep:6718 [120] S ==> swapper/2:0 [120]) __schedule+0xfe200402 ([kernel.kallsyms]) schedule+0xfe200035 ([kernel.kallsyms]) do_nanosleep+0xfe20006f ([kernel.kallsyms]) hrtimer_nanosleep+0xfe2000dc ([kernel.kallsyms]) sys_nanosleep+0xfe20007a ([kernel.kallsyms]) do_syscall_64+0xfe200062 ([kernel.kallsyms]) return_from_SYSCALL_64+0xfe200000 ([kernel.kallsyms]) __nanosleep+0xffff008b8cbe2010 (/usr/lib64/libc-2.22.so) 0.486 ( 0.073 ms): usleep/6718 ... [continued]: nanosleep()) = 0 __nanosleep+0x10 (/usr/lib64/libc-2.22.so) usleep+0x34 (/usr/lib64/libc-2.22.so) main+0x1eb (/usr/bin/usleep) __libc_start_main+0xf0 (/usr/lib64/libc-2.22.so) _start+0x29 (/usr/bin/usleep) # Pretty compact, huh? DWARF callchains for raw_syscalls:sys_exit + frame pointer callchains for a tracepoint, if your hardware supports LBR, go wild with /call-graph=lbr/, guess the next step is to lift this from 'perf script': -F, --fields comma separated output fields prepend with 'type:'. Valid types: hw,sw,trace,raw. 
Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,addr,symoff,period,iregs,brstack,brstacksym,flags Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-2e7yiv5hqdm8jywlmfivvx2v@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 41 ++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 2ec53edcf6492..a6e05e1bb3503 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2114,6 +2114,28 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, return err; } +static int trace__fprintf_callchain(struct trace *trace, struct perf_evsel *evsel, + struct perf_sample *sample) +{ + struct addr_location al; + /* TODO: user-configurable print_opts */ + const unsigned int print_opts = PRINT_IP_OPT_SYM | + PRINT_IP_OPT_DSO | + PRINT_IP_OPT_UNKNOWN_AS_ADDR; + + if (sample->callchain == NULL) + return 0; + + if (machine__resolve(trace->host, &al, sample) < 0) { + pr_err("Problem processing %s callchain, skipping...\n", + perf_evsel__name(evsel)); + return 0; + } + + return perf_evsel__fprintf_callchain(evsel, sample, &al, 38, print_opts, + scripting_max_stack, trace->output); +} + static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, union perf_event *event __maybe_unused, struct perf_sample *sample) @@ -2193,21 +2215,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, fputc('\n', trace->output); - if (sample->callchain) { - struct addr_location al; - /* TODO: user-configurable print_opts */ - const unsigned int print_opts = PRINT_IP_OPT_SYM | - PRINT_IP_OPT_DSO | - PRINT_IP_OPT_UNKNOWN_AS_ADDR; - - if (machine__resolve(trace->host, &al, sample) < 0) { - pr_err("problem processing %d event, skipping it.\n", - event->header.type); - goto out_put; - } - perf_evsel__fprintf_callchain(evsel, sample, &al, 38, print_opts, - scripting_max_stack, trace->output); - } + trace__fprintf_callchain(trace, evsel, sample); out: ttrace->entry_pending = false; err = 0; @@ -2355,6 +2363,9 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, } fprintf(trace->output, ")\n"); + + trace__fprintf_callchain(trace, evsel, sample); + return 0; } -- GitLab From 3407df8bbc3a91d9aa4910130026ab6b3a261b87 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 12 Apr 2016 15:29:24 +0200 Subject: [PATCH 232/705] perf thread_map: Add has() method Adding thread_map__has() to return bool of pid presence in threads map. 
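A hypothetical caller, to show the intended use (assuming 'threads' was built by one of the existing thread_map constructors and 'tid' is the pid being tested):

	bool tracked = thread_map__has(threads, tid);

	if (tracked)
		pr_debug("tid %d is in the thread map\n", tid);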
Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1460467771-26532-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/thread_map.c | 12 ++++++++++++ tools/perf/util/thread_map.h | 1 + 2 files changed, 13 insertions(+) diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c index 267112b4e3dbe..878ac0687b0ae 100644 --- a/tools/perf/util/thread_map.c +++ b/tools/perf/util/thread_map.c @@ -436,3 +436,15 @@ struct thread_map *thread_map__new_event(struct thread_map_event *event) return threads; } + +bool thread_map__has(struct thread_map *threads, pid_t pid) +{ + int i; + + for (i = 0; i < threads->nr; ++i) { + if (threads->map[i].pid == pid) + return true; + } + + return false; +} diff --git a/tools/perf/util/thread_map.h b/tools/perf/util/thread_map.h index 85e4c7c4fbde1..9a065ea69ff1d 100644 --- a/tools/perf/util/thread_map.h +++ b/tools/perf/util/thread_map.h @@ -55,4 +55,5 @@ static inline char *thread_map__comm(struct thread_map *map, int thread) } void thread_map__read_comms(struct thread_map *threads); +bool thread_map__has(struct thread_map *threads, pid_t pid); #endif /* __PERF_THREAD_MAP_H */ -- GitLab From e632aa69c919462a7f93c8799b97c8a9ddd48fc2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 12 Apr 2016 15:29:25 +0200 Subject: [PATCH 233/705] perf cpu_map: Add has() method Adding cpu_map__has() to return bool of cpu presence in cpus map. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1460467771-26532-3-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cpumap.c | 12 ++++++++++++ tools/perf/util/cpumap.h | 2 ++ 2 files changed, 14 insertions(+) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 9bcf2bed3a6d1..02d801670f300 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -587,3 +587,15 @@ int cpu__setup_cpunode_map(void) closedir(dir1); return 0; } + +bool cpu_map__has(struct cpu_map *cpus, int cpu) +{ + int i; + + for (i = 0; i < cpus->nr; ++i) { + if (cpus->map[i] == cpu) + return true; + } + + return false; +} diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 81a2562aaa2b0..1a0a35073ce1e 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -66,4 +66,6 @@ int cpu__get_node(int cpu); int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res, int (*f)(struct cpu_map *map, int cpu, void *data), void *data); + +bool cpu_map__has(struct cpu_map *cpus, int cpu); #endif /* __PERF_CPUMAP_H */ -- GitLab From 99623c628f5425f09b5321cf621af1da29c0c47d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 12 Apr 2016 15:29:26 +0200 Subject: [PATCH 234/705] perf sched: Add compact display option Add compact map display that does not output the whole cpu matrix, only cpus that got event. $ perf sched map --compact *A0 1082427.094098 secs A0 => perf:19404 (CPU 2) A0 *. 1082427.094127 secs . => swapper:0 (CPU 1) A0 . *B0 1082427.094174 secs B0 => rcuos/2:25 (CPU 3) A0 . *. 1082427.094177 secs *C0 . . 1082427.094187 secs C0 => migration/2:21 C0 *A0 . 1082427.094193 secs *. A0 . 1082427.094195 secs *D0 A0 . 1082427.094402 secs D0 => rngd:968 *. A0 . 1082427.094406 secs . *E0 . 1082427.095221 secs E0 => kworker/1:1:5333 . E0 *F0 1082427.095227 secs F0 => xterm:3342 It helps to display sane output for small thread loads on big cpu servers. 
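The bookkeeping behind --compact, reduced to a sketch (simplified from the map_switch_event() changes below; comp_cpus_mask is the MAX_CPUS bitmap this patch adds):

	/* the first time a cpu shows activity, give it the next free column */
	if (!test_and_set_bit(this_cpu, sched->map.comp_cpus_mask))
		sched->map.comp_cpus[cpus_nr++] = this_cpu;

	/* display then walks comp_cpus[0..cpus_nr) instead of 0..max_cpu */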
Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1460467771-26532-4-git-send-email-jolsa@kernel.org [ Add entry in 'perf sched' man page ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-sched.txt | 7 +++ tools/perf/builtin-sched.c | 62 ++++++++++++++++++++++--- 2 files changed, 63 insertions(+), 6 deletions(-) diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index 8ff4df9569512..89b0c5b7fe845 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -50,6 +50,13 @@ OPTIONS --dump-raw-trace=:: Display verbose dump of the sched data. +OPTIONS for 'perf sched map' +---------------------------- + +--compact:: + Show only CPUs with activity. Helps visualizing on high core + count systems. + SEE ALSO -------- linkperf:perf-record[1] diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 871b55ae22a41..64dd94667055d 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -122,6 +122,12 @@ struct trace_sched_handler { struct machine *machine); }; +struct perf_sched_map { + DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS); + int *comp_cpus; + bool comp; +}; + struct perf_sched { struct perf_tool tool; const char *sort_order; @@ -173,6 +179,7 @@ struct perf_sched { struct list_head sort_list, cmp_pid; bool force; bool skip_merge; + struct perf_sched_map map; }; static u64 get_nsecs(void) @@ -1347,13 +1354,24 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, int new_shortname; u64 timestamp0, timestamp = sample->time; s64 delta; - int cpu, this_cpu = sample->cpu; + int i, this_cpu = sample->cpu; + int cpus_nr; + bool new_cpu = false; BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0); if (this_cpu > sched->max_cpu) sched->max_cpu = this_cpu; + if (sched->map.comp) { + cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS); + if (!test_and_set_bit(this_cpu, sched->map.comp_cpus_mask)) { + sched->map.comp_cpus[cpus_nr++] = this_cpu; + new_cpu = true; + } + } else + cpus_nr = sched->max_cpu; + timestamp0 = sched->cpu_last_switched[this_cpu]; sched->cpu_last_switched[this_cpu] = timestamp; if (timestamp0) @@ -1400,7 +1418,9 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, new_shortname = 1; } - for (cpu = 0; cpu <= sched->max_cpu; cpu++) { + for (i = 0; i < cpus_nr; i++) { + int cpu = sched->map.comp ? sched->map.comp_cpus[i] : i; + if (cpu != this_cpu) printf(" "); else @@ -1414,12 +1434,15 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, printf(" %12.6f secs ", (double)timestamp/1e9); if (new_shortname) { - printf("%s => %s:%d\n", + printf("%s => %s:%d", sched_in->shortname, thread__comm_str(sched_in), sched_in->tid); - } else { - printf("\n"); } + if (sched->map.comp && new_cpu) + printf(" (CPU %d)", this_cpu); + + printf("\n"); + thread__put(sched_in); return 0; @@ -1675,9 +1698,22 @@ static int perf_sched__lat(struct perf_sched *sched) return 0; } +static int setup_map_cpus(struct perf_sched *sched) +{ + sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF); + + if (sched->map.comp) { + sched->map.comp_cpus = zalloc(sched->max_cpu * sizeof(int)); + return sched->map.comp_cpus ? 
0 : -1; + } + + return 0; +} + static int perf_sched__map(struct perf_sched *sched) { - sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF); + if (setup_map_cpus(sched)) + return -1; setup_pager(); if (perf_sched__read_events(sched)) @@ -1831,6 +1867,11 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) "dump raw trace in ASCII"), OPT_END() }; + const struct option map_options[] = { + OPT_BOOLEAN(0, "compact", &sched.map.comp, + "map output in compact mode"), + OPT_END() + }; const char * const latency_usage[] = { "perf sched latency []", NULL @@ -1839,6 +1880,10 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) "perf sched replay []", NULL }; + const char * const map_usage[] = { + "perf sched map []", + NULL + }; const char *const sched_subcommands[] = { "record", "latency", "map", "replay", "script", NULL }; const char *sched_usage[] = { @@ -1887,6 +1932,11 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) setup_sorting(&sched, latency_options, latency_usage); return perf_sched__lat(&sched); } else if (!strcmp(argv[0], "map")) { + if (argc) { + argc = parse_options(argc, argv, map_options, replay_usage, 0); + if (argc) + usage_with_options(map_usage, map_options); + } sched.tp_handler = &map_ops; setup_sorting(&sched, latency_options, latency_usage); return perf_sched__map(&sched); -- GitLab From 8cd91195e5efc5166fc48eec6cf83ef93133b7b6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 12 Apr 2016 15:29:27 +0200 Subject: [PATCH 235/705] perf sched: Use color_fprintf for output As preparation for next patch. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1460467771-26532-5-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 64dd94667055d..9ef28973f1983 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -11,6 +11,7 @@ #include "util/session.h" #include "util/tool.h" #include "util/cloexec.h" +#include "util/color.h" #include #include "util/trace-event.h" @@ -1357,6 +1358,7 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, int i, this_cpu = sample->cpu; int cpus_nr; bool new_cpu = false; + const char *color = PERF_COLOR_NORMAL; BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0); @@ -1422,26 +1424,26 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, int cpu = sched->map.comp ? 
sched->map.comp_cpus[i] : i; if (cpu != this_cpu) - printf(" "); + color_fprintf(stdout, color, " "); else - printf("*"); + color_fprintf(stdout, color, "*"); if (sched->curr_thread[cpu]) - printf("%2s ", sched->curr_thread[cpu]->shortname); + color_fprintf(stdout, color, "%2s ", sched->curr_thread[cpu]->shortname); else - printf(" "); + color_fprintf(stdout, color, " "); } - printf(" %12.6f secs ", (double)timestamp/1e9); + color_fprintf(stdout, color, " %12.6f secs ", (double)timestamp/1e9); if (new_shortname) { - printf("%s => %s:%d", + color_fprintf(stdout, color, "%s => %s:%d", sched_in->shortname, thread__comm_str(sched_in), sched_in->tid); } if (sched->map.comp && new_cpu) - printf(" (CPU %d)", this_cpu); + color_fprintf(stdout, color, " (CPU %d)", this_cpu); - printf("\n"); + color_fprintf(stdout, color, "\n"); thread__put(sched_in); -- GitLab From 097be0f5034fc9edaf84253b773b14bc2af9a708 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 12 Apr 2016 15:29:28 +0200 Subject: [PATCH 236/705] perf thread_map: Make new_by_tid_str constructor public It will be used in following patch. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1460467771-26532-6-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/thread_map.c | 2 +- tools/perf/util/thread_map.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c index 878ac0687b0ae..5654fe15e0367 100644 --- a/tools/perf/util/thread_map.c +++ b/tools/perf/util/thread_map.c @@ -260,7 +260,7 @@ struct thread_map *thread_map__new_dummy(void) return threads; } -static struct thread_map *thread_map__new_by_tid_str(const char *tid_str) +struct thread_map *thread_map__new_by_tid_str(const char *tid_str) { struct thread_map *threads = NULL, *nt; int ntasks = 0; diff --git a/tools/perf/util/thread_map.h b/tools/perf/util/thread_map.h index 9a065ea69ff1d..bd3b971588da5 100644 --- a/tools/perf/util/thread_map.h +++ b/tools/perf/util/thread_map.h @@ -31,6 +31,8 @@ void thread_map__put(struct thread_map *map); struct thread_map *thread_map__new_str(const char *pid, const char *tid, uid_t uid); +struct thread_map *thread_map__new_by_tid_str(const char *tid_str); + size_t thread_map__fprintf(struct thread_map *threads, FILE *fp); static inline int thread_map__nr(struct thread_map *threads) -- GitLab From a151a37a760aab41c115af8d5016e449228e8d2e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 12 Apr 2016 15:29:29 +0200 Subject: [PATCH 237/705] perf sched map: Color given pids Adding --color-pids option to display selected pids in color (blue by default). It helps on navigating through the 'perf sched map' output. Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1460467771-26532-7-git-send-email-jolsa@kernel.org [ Added entry to man page ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-sched.txt | 3 + tools/perf/builtin-sched.c | 77 +++++++++++++++++++++++-- 2 files changed, 76 insertions(+), 4 deletions(-) diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index 89b0c5b7fe845..67913de3aee74 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -57,6 +57,9 @@ OPTIONS for 'perf sched map' Show only CPUs with activity. Helps visualizing on high core count systems. 
+--color-pids:: + Highlight the given pids. + SEE ALSO -------- linkperf:perf-record[1] diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 9ef28973f1983..b5361a1d20e14 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -11,6 +11,7 @@ #include "util/session.h" #include "util/tool.h" #include "util/cloexec.h" +#include "util/thread_map.h" #include "util/color.h" #include @@ -123,10 +124,14 @@ struct trace_sched_handler { struct machine *machine); }; +#define COLOR_PIDS PERF_COLOR_BLUE + struct perf_sched_map { DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS); int *comp_cpus; bool comp; + struct thread_map *color_pids; + const char *color_pids_str; }; struct perf_sched { @@ -1347,6 +1352,38 @@ static int process_sched_wakeup_event(struct perf_tool *tool, return 0; } +union map_priv { + void *ptr; + bool color; +}; + +static bool thread__has_color(struct thread *thread) +{ + union map_priv priv = { + .ptr = thread__priv(thread), + }; + + return priv.color; +} + +static struct thread* +map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid) +{ + struct thread *thread = machine__findnew_thread(machine, pid, tid); + union map_priv priv = { + .color = false, + }; + + if (!sched->map.color_pids || !thread || thread__priv(thread)) + return thread; + + if (thread_map__has(sched->map.color_pids, tid)) + priv.color = true; + + thread__set_priv(thread, priv.ptr); + return thread; +} + static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, struct perf_sample *sample, struct machine *machine) { @@ -1386,7 +1423,7 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, return -1; } - sched_in = machine__findnew_thread(machine, -1, next_pid); + sched_in = map__findnew_thread(sched, machine, -1, next_pid); if (sched_in == NULL) return -1; @@ -1422,6 +1459,11 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, for (i = 0; i < cpus_nr; i++) { int cpu = sched->map.comp ? 
sched->map.comp_cpus[i] : i; + struct thread *curr_thread = sched->curr_thread[cpu]; + const char *pid_color = color; + + if (curr_thread && thread__has_color(curr_thread)) + pid_color = COLOR_PIDS; if (cpu != this_cpu) color_fprintf(stdout, color, " "); @@ -1429,14 +1471,19 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, color_fprintf(stdout, color, "*"); if (sched->curr_thread[cpu]) - color_fprintf(stdout, color, "%2s ", sched->curr_thread[cpu]->shortname); + color_fprintf(stdout, pid_color, "%2s ", sched->curr_thread[cpu]->shortname); else color_fprintf(stdout, color, " "); } color_fprintf(stdout, color, " %12.6f secs ", (double)timestamp/1e9); if (new_shortname) { - color_fprintf(stdout, color, "%s => %s:%d", + const char *pid_color = color; + + if (thread__has_color(sched_in)) + pid_color = COLOR_PIDS; + + color_fprintf(stdout, pid_color, "%s => %s:%d", sched_in->shortname, thread__comm_str(sched_in), sched_in->tid); } @@ -1712,11 +1759,31 @@ static int setup_map_cpus(struct perf_sched *sched) return 0; } +static int setup_color_pids(struct perf_sched *sched) +{ + struct thread_map *map; + + if (!sched->map.color_pids_str) + return 0; + + map = thread_map__new_by_tid_str(sched->map.color_pids_str); + if (!map) { + pr_err("failed to get thread map from %s\n", sched->map.color_pids_str); + return -1; + } + + sched->map.color_pids = map; + return 0; +} + static int perf_sched__map(struct perf_sched *sched) { if (setup_map_cpus(sched)) return -1; + if (setup_color_pids(sched)) + return -1; + setup_pager(); if (perf_sched__read_events(sched)) return -1; @@ -1872,6 +1939,8 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) const struct option map_options[] = { OPT_BOOLEAN(0, "compact", &sched.map.comp, "map output in compact mode"), + OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids", + "highlight given pids in map"), OPT_END() }; const char * const latency_usage[] = { @@ -1935,7 +2004,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) return perf_sched__lat(&sched); } else if (!strcmp(argv[0], "map")) { if (argc) { - argc = parse_options(argc, argv, map_options, replay_usage, 0); + argc = parse_options(argc, argv, map_options, map_usage, 0); if (argc) usage_with_options(map_usage, map_options); } -- GitLab From cf294f24f8c83bca6aa8e96b5cc4f78bed887f92 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 12 Apr 2016 15:29:30 +0200 Subject: [PATCH 238/705] perf sched map: Color given cpus Adding --color-cpus option to display selected cpus with background color (red by default). It helps on navigating through the perf sched map output. Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1460467771-26532-8-git-send-email-jolsa@kernel.org [ Added entry to man page ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-sched.txt | 3 +++ tools/perf/builtin-sched.c | 36 ++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index 67913de3aee74..58bff6cbc3f37 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -57,6 +57,9 @@ OPTIONS for 'perf sched map' Show only CPUs with activity. Helps visualizing on high core count systems. +--color-cpus:: + Highlight the given cpus. + --color-pids:: Highlight the given pids. 
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index b5361a1d20e14..7de04b297c145 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -125,6 +125,7 @@ struct trace_sched_handler { }; #define COLOR_PIDS PERF_COLOR_BLUE +#define COLOR_CPUS PERF_COLOR_BG_RED struct perf_sched_map { DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS); @@ -132,6 +133,8 @@ struct perf_sched_map { bool comp; struct thread_map *color_pids; const char *color_pids_str; + struct cpu_map *color_cpus; + const char *color_cpus_str; }; struct perf_sched { @@ -1461,14 +1464,18 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, int cpu = sched->map.comp ? sched->map.comp_cpus[i] : i; struct thread *curr_thread = sched->curr_thread[cpu]; const char *pid_color = color; + const char *cpu_color = color; if (curr_thread && thread__has_color(curr_thread)) pid_color = COLOR_PIDS; + if (sched->map.color_cpus && cpu_map__has(sched->map.color_cpus, cpu)) + cpu_color = COLOR_CPUS; + if (cpu != this_cpu) - color_fprintf(stdout, color, " "); + color_fprintf(stdout, cpu_color, " "); else - color_fprintf(stdout, color, "*"); + color_fprintf(stdout, cpu_color, "*"); if (sched->curr_thread[cpu]) color_fprintf(stdout, pid_color, "%2s ", sched->curr_thread[cpu]->shortname); @@ -1753,7 +1760,8 @@ static int setup_map_cpus(struct perf_sched *sched) if (sched->map.comp) { sched->map.comp_cpus = zalloc(sched->max_cpu * sizeof(int)); - return sched->map.comp_cpus ? 0 : -1; + if (!sched->map.comp_cpus) + return -1; } return 0; @@ -1776,6 +1784,23 @@ static int setup_color_pids(struct perf_sched *sched) return 0; } +static int setup_color_cpus(struct perf_sched *sched) +{ + struct cpu_map *map; + + if (!sched->map.color_cpus_str) + return 0; + + map = cpu_map__new(sched->map.color_cpus_str); + if (!map) { + pr_err("failed to get thread map from %s\n", sched->map.color_cpus_str); + return -1; + } + + sched->map.color_cpus = map; + return 0; +} + static int perf_sched__map(struct perf_sched *sched) { if (setup_map_cpus(sched)) @@ -1784,6 +1809,9 @@ static int perf_sched__map(struct perf_sched *sched) if (setup_color_pids(sched)) return -1; + if (setup_color_cpus(sched)) + return -1; + setup_pager(); if (perf_sched__read_events(sched)) return -1; @@ -1941,6 +1969,8 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) "map output in compact mode"), OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids", "highlight given pids in map"), + OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus", + "highlight given CPUs in map"), OPT_END() }; const char * const latency_usage[] = { -- GitLab From 73643bb6a21c85509c7ae4c316f502c5a19cce65 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 12 Apr 2016 15:29:31 +0200 Subject: [PATCH 239/705] perf sched map: Display only given cpus Introducing --cpus option that will display only given cpus. Could be used together with color-cpus option. $ perf sched map --cpus 0,1 *A0 309999.786924 secs A0 => rcu_sched:7 *. 309999.786930 secs *B0 . 
309999.786931 secs B0 => rcuos/2:25 B0 *A0 309999.786947 secs Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1460467771-26532-9-git-send-email-jolsa@kernel.org [ Added entry to man page ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-sched.txt | 3 +++ tools/perf/builtin-sched.c | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index 58bff6cbc3f37..1cc08cc47ac53 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -57,6 +57,9 @@ OPTIONS for 'perf sched map' Show only CPUs with activity. Helps visualizing on high core count systems. +--cpus:: + Show just entries with activities for the given CPUs. + --color-cpus:: Highlight the given cpus. diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 7de04b297c145..afa057666c2ad 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -135,6 +135,8 @@ struct perf_sched_map { const char *color_pids_str; struct cpu_map *color_cpus; const char *color_cpus_str; + struct cpu_map *cpus; + const char *cpus_str; }; struct perf_sched { @@ -1469,6 +1471,9 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, if (curr_thread && thread__has_color(curr_thread)) pid_color = COLOR_PIDS; + if (sched->map.cpus && !cpu_map__has(sched->map.cpus, cpu)) + continue; + if (sched->map.color_cpus && cpu_map__has(sched->map.color_cpus, cpu)) cpu_color = COLOR_CPUS; @@ -1483,6 +1488,9 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, color_fprintf(stdout, color, " "); } + if (sched->map.cpus && !cpu_map__has(sched->map.cpus, this_cpu)) + goto out; + color_fprintf(stdout, color, " %12.6f secs ", (double)timestamp/1e9); if (new_shortname) { const char *pid_color = color; @@ -1497,6 +1505,7 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, if (sched->map.comp && new_cpu) color_fprintf(stdout, color, " (CPU %d)", this_cpu); +out: color_fprintf(stdout, color, "\n"); thread__put(sched_in); @@ -1756,6 +1765,8 @@ static int perf_sched__lat(struct perf_sched *sched) static int setup_map_cpus(struct perf_sched *sched) { + struct cpu_map *map; + sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF); if (sched->map.comp) { @@ -1764,6 +1775,16 @@ static int setup_map_cpus(struct perf_sched *sched) return -1; } + if (!sched->map.cpus_str) + return 0; + + map = cpu_map__new(sched->map.cpus_str); + if (!map) { + pr_err("failed to get cpus map from %s\n", sched->map.cpus_str); + return -1; + } + + sched->map.cpus = map; return 0; } @@ -1971,6 +1992,8 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) "highlight given pids in map"), OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus", "highlight given CPUs in map"), + OPT_STRING(0, "cpus", &sched.map.cpus_str, "cpus", + "display given CPUs in map"), OPT_END() }; const char * const latency_usage[] = { -- GitLab From e20ab86e51218f9949f41fb39a6c4f63b662f135 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 12 Apr 2016 15:16:15 -0300 Subject: [PATCH 240/705] perf evsel: Move some methods from session.[ch] to evsel.[ch] Those were converted to be evsel methods long ago, move the source to where it belongs. 
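The PRINT_IP_OPT_* constants being renamed to EVSEL__PRINT_* in this move are plain bit flags, so callers compose an option word with |, test bits with &, and strip bits with &= ~, which is exactly what print_sample_bts() does with EVSEL__PRINT_SRCLINE in the hunk below. A tiny self-contained sketch of the idiom, with the flag values copied from the evsel.h hunk (show_opts() is a made-up helper for the example):

#include <stdio.h>

#define EVSEL__PRINT_IP		(1<<0)
#define EVSEL__PRINT_SYM	(1<<1)
#define EVSEL__PRINT_DSO	(1<<2)
#define EVSEL__PRINT_SRCLINE	(1<<5)

static void show_opts(unsigned int opts)
{
	printf("ip:%d sym:%d dso:%d srcline:%d\n",
	       !!(opts & EVSEL__PRINT_IP), !!(opts & EVSEL__PRINT_SYM),
	       !!(opts & EVSEL__PRINT_DSO), !!(opts & EVSEL__PRINT_SRCLINE));
}

int main(void)
{
	unsigned int opts = EVSEL__PRINT_IP | EVSEL__PRINT_SYM;	/* compose */

	opts |= EVSEL__PRINT_SRCLINE;	/* turn a bit on */
	opts &= ~EVSEL__PRINT_SRCLINE;	/* turn it back off, like print_sample_bts() */
	show_opts(opts);
	return 0;
}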
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-vja8rjmkw3gd5ungaeyb5s2j@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 14 ++-- tools/perf/builtin-trace.c | 6 +- tools/perf/util/evsel.c | 131 ++++++++++++++++++++++++++++++++++++ tools/perf/util/evsel.h | 13 ++++ tools/perf/util/session.c | 130 ----------------------------------- tools/perf/util/session.h | 13 ---- 6 files changed, 154 insertions(+), 153 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index ddd5b79e94c27..838c0bc38105f 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -317,19 +317,19 @@ static void set_print_ip_opts(struct perf_event_attr *attr) output[type].print_ip_opts = 0; if (PRINT_FIELD(IP)) - output[type].print_ip_opts |= PRINT_IP_OPT_IP; + output[type].print_ip_opts |= EVSEL__PRINT_IP; if (PRINT_FIELD(SYM)) - output[type].print_ip_opts |= PRINT_IP_OPT_SYM; + output[type].print_ip_opts |= EVSEL__PRINT_SYM; if (PRINT_FIELD(DSO)) - output[type].print_ip_opts |= PRINT_IP_OPT_DSO; + output[type].print_ip_opts |= EVSEL__PRINT_DSO; if (PRINT_FIELD(SYMOFFSET)) - output[type].print_ip_opts |= PRINT_IP_OPT_SYMOFFSET; + output[type].print_ip_opts |= EVSEL__PRINT_SYMOFFSET; if (PRINT_FIELD(SRCLINE)) - output[type].print_ip_opts |= PRINT_IP_OPT_SRCLINE; + output[type].print_ip_opts |= EVSEL__PRINT_SRCLINE; } /* @@ -574,9 +574,9 @@ static void print_sample_bts(struct perf_sample *sample, printf("\n"); } else { printf(" "); - if (print_opts & PRINT_IP_OPT_SRCLINE) { + if (print_opts & EVSEL__PRINT_SRCLINE) { print_srcline_last = true; - print_opts &= ~PRINT_IP_OPT_SRCLINE; + print_opts &= ~EVSEL__PRINT_SRCLINE; } } perf_evsel__fprintf_sym(evsel, sample, al, 0, print_opts, diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index a6e05e1bb3503..b842ddd3ad0cf 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2119,9 +2119,9 @@ static int trace__fprintf_callchain(struct trace *trace, struct perf_evsel *evse { struct addr_location al; /* TODO: user-configurable print_opts */ - const unsigned int print_opts = PRINT_IP_OPT_SYM | - PRINT_IP_OPT_DSO | - PRINT_IP_OPT_UNKNOWN_AS_ADDR; + const unsigned int print_opts = EVSEL__PRINT_SYM | + EVSEL__PRINT_DSO | + EVSEL__PRINT_UNKNOWN_AS_ADDR; if (sample->callchain == NULL) return 0; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index d475a4ec8b570..6e86598682be4 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2343,6 +2343,137 @@ int perf_evsel__fprintf(struct perf_evsel *evsel, return ++printed; } +int perf_evsel__fprintf_callchain(struct perf_evsel *evsel, struct perf_sample *sample, + struct addr_location *al, int left_alignment, + unsigned int print_opts, unsigned int stack_depth, + FILE *fp) +{ + int printed = 0; + struct callchain_cursor_node *node; + int print_ip = print_opts & EVSEL__PRINT_IP; + int print_sym = print_opts & EVSEL__PRINT_SYM; + int print_dso = print_opts & EVSEL__PRINT_DSO; + int print_symoffset = print_opts & EVSEL__PRINT_SYMOFFSET; + int print_oneline = print_opts & EVSEL__PRINT_ONELINE; + int print_srcline = print_opts & EVSEL__PRINT_SRCLINE; + int print_unknown_as_addr = print_opts & EVSEL__PRINT_UNKNOWN_AS_ADDR; + char s = print_oneline ? 
' ' : '\t'; + + if (sample->callchain) { + struct addr_location node_al; + + if (thread__resolve_callchain(al->thread, evsel, + sample, NULL, NULL, + stack_depth) != 0) { + if (verbose) + error("Failed to resolve callchain. Skipping\n"); + return printed; + } + callchain_cursor_commit(&callchain_cursor); + + if (print_symoffset) + node_al = *al; + + while (stack_depth) { + u64 addr = 0; + + node = callchain_cursor_current(&callchain_cursor); + if (!node) + break; + + if (node->sym && node->sym->ignore) + goto next; + + printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); + + if (print_ip) + printed += fprintf(fp, "%c%16" PRIx64, s, node->ip); + + if (node->map) + addr = node->map->map_ip(node->map, node->ip); + + if (print_sym) { + printed += fprintf(fp, " "); + node_al.addr = addr; + node_al.map = node->map; + + if (print_symoffset) { + printed += __symbol__fprintf_symname_offs(node->sym, &node_al, + print_unknown_as_addr, fp); + } else { + printed += __symbol__fprintf_symname(node->sym, &node_al, + print_unknown_as_addr, fp); + } + } + + if (print_dso) { + printed += fprintf(fp, " ("); + printed += map__fprintf_dsoname(node->map, fp); + printed += fprintf(fp, ")"); + } + + if (print_srcline) + printed += map__fprintf_srcline(node->map, addr, "\n ", fp); + + if (!print_oneline) + printed += fprintf(fp, "\n"); + + stack_depth--; +next: + callchain_cursor_advance(&callchain_cursor); + } + } + + return printed; +} + +int perf_evsel__fprintf_sym(struct perf_evsel *evsel, struct perf_sample *sample, + struct addr_location *al, int left_alignment, + unsigned int print_opts, unsigned int stack_depth, + FILE *fp) +{ + int printed = 0; + int print_ip = print_opts & EVSEL__PRINT_IP; + int print_sym = print_opts & EVSEL__PRINT_SYM; + int print_dso = print_opts & EVSEL__PRINT_DSO; + int print_symoffset = print_opts & EVSEL__PRINT_SYMOFFSET; + int print_srcline = print_opts & EVSEL__PRINT_SRCLINE; + int print_unknown_as_addr = print_opts & EVSEL__PRINT_UNKNOWN_AS_ADDR; + + if (symbol_conf.use_callchain && sample->callchain) { + printed += perf_evsel__fprintf_callchain(evsel, sample, al, left_alignment, + print_opts, stack_depth, fp); + } else if (!(al->sym && al->sym->ignore)) { + printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); + + if (print_ip) + printed += fprintf(fp, "%16" PRIx64, sample->ip); + + if (print_sym) { + printed += fprintf(fp, " "); + if (print_symoffset) { + printed += __symbol__fprintf_symname_offs(al->sym, al, + print_unknown_as_addr, fp); + } else { + printed += __symbol__fprintf_symname(al->sym, al, + print_unknown_as_addr, fp); + } + } + + if (print_dso) { + printed += fprintf(fp, " ("); + printed += map__fprintf_dsoname(al->map, fp); + printed += fprintf(fp, ")"); + } + + if (print_srcline) + printed += map__fprintf_srcline(al->map, al->addr, "\n ", fp); + } + + return printed; +} + + bool perf_evsel__fallback(struct perf_evsel *evsel, int err, char *msg, size_t msgsize) { diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 1bd6c2e02dfa1..36edd3c91d5c0 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -387,12 +387,25 @@ struct perf_attr_details { int perf_evsel__fprintf(struct perf_evsel *evsel, struct perf_attr_details *details, FILE *fp); +#define EVSEL__PRINT_IP (1<<0) +#define EVSEL__PRINT_SYM (1<<1) +#define EVSEL__PRINT_DSO (1<<2) +#define EVSEL__PRINT_SYMOFFSET (1<<3) +#define EVSEL__PRINT_ONELINE (1<<4) +#define EVSEL__PRINT_SRCLINE (1<<5) +#define EVSEL__PRINT_UNKNOWN_AS_ADDR (1<<6) + int 
perf_evsel__fprintf_callchain(struct perf_evsel *evsel, struct perf_sample *sample, struct addr_location *al, int left_alignment, unsigned int print_opts, unsigned int stack_depth, FILE *fp); +int perf_evsel__fprintf_sym(struct perf_evsel *evsel, struct perf_sample *sample, + struct addr_location *al, int left_alignment, + unsigned int print_opts, unsigned int stack_depth, + FILE *fp); + bool perf_evsel__fallback(struct perf_evsel *evsel, int err, char *msg, size_t msgsize); int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 0516d06a2741a..91d4528d71fa8 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1953,136 +1953,6 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, return NULL; } -int perf_evsel__fprintf_callchain(struct perf_evsel *evsel, struct perf_sample *sample, - struct addr_location *al, int left_alignment, - unsigned int print_opts, unsigned int stack_depth, - FILE *fp) -{ - int printed = 0; - struct callchain_cursor_node *node; - int print_ip = print_opts & PRINT_IP_OPT_IP; - int print_sym = print_opts & PRINT_IP_OPT_SYM; - int print_dso = print_opts & PRINT_IP_OPT_DSO; - int print_symoffset = print_opts & PRINT_IP_OPT_SYMOFFSET; - int print_oneline = print_opts & PRINT_IP_OPT_ONELINE; - int print_srcline = print_opts & PRINT_IP_OPT_SRCLINE; - int print_unknown_as_addr = print_opts & PRINT_IP_OPT_UNKNOWN_AS_ADDR; - char s = print_oneline ? ' ' : '\t'; - - if (sample->callchain) { - struct addr_location node_al; - - if (thread__resolve_callchain(al->thread, evsel, - sample, NULL, NULL, - stack_depth) != 0) { - if (verbose) - error("Failed to resolve callchain. Skipping\n"); - return printed; - } - callchain_cursor_commit(&callchain_cursor); - - if (print_symoffset) - node_al = *al; - - while (stack_depth) { - u64 addr = 0; - - node = callchain_cursor_current(&callchain_cursor); - if (!node) - break; - - if (node->sym && node->sym->ignore) - goto next; - - printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); - - if (print_ip) - printed += fprintf(fp, "%c%16" PRIx64, s, node->ip); - - if (node->map) - addr = node->map->map_ip(node->map, node->ip); - - if (print_sym) { - printed += fprintf(fp, " "); - node_al.addr = addr; - node_al.map = node->map; - - if (print_symoffset) { - printed += __symbol__fprintf_symname_offs(node->sym, &node_al, - print_unknown_as_addr, fp); - } else { - printed += __symbol__fprintf_symname(node->sym, &node_al, - print_unknown_as_addr, fp); - } - } - - if (print_dso) { - printed += fprintf(fp, " ("); - printed += map__fprintf_dsoname(node->map, fp); - printed += fprintf(fp, ")"); - } - - if (print_srcline) - printed += map__fprintf_srcline(node->map, addr, "\n ", fp); - - if (!print_oneline) - printed += fprintf(fp, "\n"); - - stack_depth--; -next: - callchain_cursor_advance(&callchain_cursor); - } - } - - return printed; -} - -int perf_evsel__fprintf_sym(struct perf_evsel *evsel, struct perf_sample *sample, - struct addr_location *al, int left_alignment, - unsigned int print_opts, unsigned int stack_depth, - FILE *fp) -{ - int printed = 0; - int print_ip = print_opts & PRINT_IP_OPT_IP; - int print_sym = print_opts & PRINT_IP_OPT_SYM; - int print_dso = print_opts & PRINT_IP_OPT_DSO; - int print_symoffset = print_opts & PRINT_IP_OPT_SYMOFFSET; - int print_srcline = print_opts & PRINT_IP_OPT_SRCLINE; - int print_unknown_as_addr = print_opts & PRINT_IP_OPT_UNKNOWN_AS_ADDR; - - if 
(symbol_conf.use_callchain && sample->callchain) { - printed += perf_evsel__fprintf_callchain(evsel, sample, al, left_alignment, - print_opts, stack_depth, fp); - } else if (!(al->sym && al->sym->ignore)) { - printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); - - if (print_ip) - printed += fprintf(fp, "%16" PRIx64, sample->ip); - - if (print_sym) { - printed += fprintf(fp, " "); - if (print_symoffset) { - printed += __symbol__fprintf_symname_offs(al->sym, al, - print_unknown_as_addr, fp); - } else { - printed += __symbol__fprintf_symname(al->sym, al, - print_unknown_as_addr, fp); - } - } - - if (print_dso) { - printed += fprintf(fp, " ("); - printed += map__fprintf_dsoname(al->map, fp); - printed += fprintf(fp, ")"); - } - - if (print_srcline) - printed += map__fprintf_srcline(al->map, al->addr, "\n ", fp); - } - - return printed; -} - int perf_session__cpu_bitmap(struct perf_session *session, const char *cpu_list, unsigned long *cpu_bitmap) { diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 4257fac566186..4bd758553450c 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -36,14 +36,6 @@ struct perf_session { struct perf_tool *tool; }; -#define PRINT_IP_OPT_IP (1<<0) -#define PRINT_IP_OPT_SYM (1<<1) -#define PRINT_IP_OPT_DSO (1<<2) -#define PRINT_IP_OPT_SYMOFFSET (1<<3) -#define PRINT_IP_OPT_ONELINE (1<<4) -#define PRINT_IP_OPT_SRCLINE (1<<5) -#define PRINT_IP_OPT_UNKNOWN_AS_ADDR (1<<6) - struct perf_tool; struct perf_session *perf_session__new(struct perf_data_file *file, @@ -105,11 +97,6 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp); struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, unsigned int type); -int perf_evsel__fprintf_sym(struct perf_evsel *evsel, struct perf_sample *sample, - struct addr_location *al, int left_alignment, - unsigned int print_opts, unsigned int stack_depth, - FILE *fp); - int perf_session__cpu_bitmap(struct perf_session *session, const char *cpu_list, unsigned long *cpu_bitmap); -- GitLab From 59247e33ff494e3643cdff54b64bf72575052b76 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 12 Apr 2016 16:05:02 -0300 Subject: [PATCH 241/705] perf trace: Do not accept --no-syscalls together with -e Doesn't make sense and was causing a segfault, fix it. # trace -e clone --no-syscalls --event sched:*exec firefox The -e option can't be used with --no-syscalls. 
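The subtler half of the fix is the 'err = -1;' hunk: in a function that funnels every failure through 'goto out', the status variable must already hold an error value before the first validation jump, otherwise a rejected option combination could exit with a stale success code. A reduced sketch of the idiom, with hypothetical options_conflict()/run() helpers:

#include <stdio.h>

static int options_conflict(int a, int b)
{
	return a && b;	/* the two options are mutually exclusive */
}

static int run(int opt_a, int opt_b)
{
	int err = -1;	/* assume failure until the happy path is reached */

	if (options_conflict(opt_a, opt_b)) {
		fprintf(stderr, "The two options can't be combined.\n");
		goto out;	/* err is already -1, not uninitialized or 0 */
	}
	err = 0;	/* the real work would happen here */
out:
	return err;
}

int main(void)
{
	return run(1, 1) == 0 ? 0 : 1;
}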
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-ccrahezikdk2uebptzr1eyyi@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index b842ddd3ad0cf..d49c131bb5de1 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3344,6 +3344,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) goto out; } + err = -1; + if (trace.trace_pgfaults) { trace.opts.sample_address = true; trace.opts.sample_time = true; @@ -3368,6 +3370,11 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) return -1; } + if (!trace.trace_syscalls && ev_qualifier_str) { + pr_err("The -e option can't be used with --no-syscalls.\n"); + goto out; + } + if (output_name != NULL) { err = trace__open_output(&trace, output_name); if (err < 0) { -- GitLab From 6fb35b9515c6300cb25022ac74f5f5617a349e18 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 13 Apr 2016 11:50:23 -0300 Subject: [PATCH 242/705] perf trace: Add seccomp beautifier related defines for older systems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Where the detached tarball (make perf-tar-src-pkg) build was failing because those definitions aren't available in the system headers. On RHEL7, for instance: builtin-trace.c: In function ‘syscall_arg__scnprintf_seccomp_op’: builtin-trace.c:1069:7: error: ‘SECCOMP_SET_MODE_STRICT’ undeclared (first use in this function) P_SECCOMP_SET_MODE_OP(STRICT); ^ builtin-trace.c:1069:7: note: each undeclared identifier is reported only once for each function it appears in builtin-trace.c:1070:7: error: ‘SECCOMP_SET_MODE_FILTER’ undeclared (first use in this function) P_SECCOMP_SET_MODE_OP(FILTER); ^ builtin-trace.c: In function ‘syscall_arg__scnprintf_seccomp_flags’: builtin-trace.c:1091:14: error: ‘SECCOMP_FILTER_FLAG_TSYNC’ undeclared (first use in this function) P_FLAG(TSYNC); ^ Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-4f8dzzwd7g6l5dzz693u7kul@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index d49c131bb5de1..246866c63e5ec 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1059,6 +1059,13 @@ static const char *tioctls[] = { static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401); #endif /* defined(__i386__) || defined(__x86_64__) */ +#ifndef SECCOMP_SET_MODE_STRICT +#define SECCOMP_SET_MODE_STRICT 0 +#endif +#ifndef SECCOMP_SET_MODE_FILTER +#define SECCOMP_SET_MODE_FILTER 1 +#endif + static size_t syscall_arg__scnprintf_seccomp_op(char *bf, size_t size, struct syscall_arg *arg) { int op = arg->val; @@ -1077,6 +1084,10 @@ static size_t syscall_arg__scnprintf_seccomp_op(char *bf, size_t size, struct sy #define SCA_SECCOMP_OP syscall_arg__scnprintf_seccomp_op +#ifndef SECCOMP_FILTER_FLAG_TSYNC +#define SECCOMP_FILTER_FLAG_TSYNC 1 +#endif + static size_t syscall_arg__scnprintf_seccomp_flags(char *bf, size_t size, struct syscall_arg *arg) { -- GitLab From a355a61e43484ac02553b34b5f84ee84ca765fd6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 13 Apr 2016 11:55:18 -0300 Subject: [PATCH 243/705] perf 
trace: Add getrandom beautifier related defines for older systems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Where the detached tarball (make perf-tar-src-pkg) build was failing because those definitions aren't available in the system headers. On RHEL7, for instance: builtin-trace.c: In function ‘syscall_arg__scnprintf_getrandom_flags’: builtin-trace.c:1113:14: error: ‘GRND_RANDOM’ undeclared (first use in this function) P_FLAG(RANDOM); ^ builtin-trace.c:1114:14: error: ‘GRND_NONBLOCK’ undeclared (first use in this function) P_FLAG(NONBLOCK); ^ Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-r8496g24a3kbqynvk6617b0e@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 246866c63e5ec..653d4c7422e91 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1110,6 +1110,13 @@ static size_t syscall_arg__scnprintf_seccomp_flags(char *bf, size_t size, #define SCA_SECCOMP_FLAGS syscall_arg__scnprintf_seccomp_flags +#ifndef GRND_NONBLOCK +#define GRND_NONBLOCK 0x0001 +#endif +#ifndef GRND_RANDOM +#define GRND_RANDOM 0x0002 +#endif + static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, struct syscall_arg *arg) { -- GitLab From df4cb1678e2ea91eb66665f00c4a43d898a12697 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 13 Apr 2016 12:05:44 -0300 Subject: [PATCH 244/705] perf trace: Move mmap beautifiers to trace/beauty/ directory To better organize all these beautifiers. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-zbr27mdy9ssdhux3ib2nfa7j@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 159 +-------------------------- tools/perf/trace/beauty/mmap.c | 158 ++++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+), 158 deletions(-) create mode 100644 tools/perf/trace/beauty/mmap.c diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 653d4c7422e91..abd5a94f5dbeb 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -39,7 +39,6 @@ #include /* FIXME: Still needed for audit_errno_to_name */ #include -#include #include #include #include @@ -49,22 +48,6 @@ #include /* For older distros: */ -#ifndef MAP_STACK -# define MAP_STACK 0x20000 -#endif - -#ifndef MADV_HWPOISON -# define MADV_HWPOISON 100 - -#endif - -#ifndef MADV_MERGEABLE -# define MADV_MERGEABLE 12 -#endif - -#ifndef MADV_UNMERGEABLE -# define MADV_UNMERGEABLE 13 -#endif #ifndef EFD_SEMAPHORE # define EFD_SEMAPHORE 1 @@ -429,147 +412,6 @@ static size_t syscall_arg__scnprintf_int(char *bf, size_t size, #define SCA_INT syscall_arg__scnprintf_int -static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, prot = arg->val; - - if (prot == PROT_NONE) - return scnprintf(bf, size, "NONE"); -#define P_MMAP_PROT(n) \ - if (prot & PROT_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? 
"|" : "", #n); \ - prot &= ~PROT_##n; \ - } - - P_MMAP_PROT(EXEC); - P_MMAP_PROT(READ); - P_MMAP_PROT(WRITE); -#ifdef PROT_SEM - P_MMAP_PROT(SEM); -#endif - P_MMAP_PROT(GROWSDOWN); - P_MMAP_PROT(GROWSUP); -#undef P_MMAP_PROT - - if (prot) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot); - - return printed; -} - -#define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot - -static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, flags = arg->val; - -#define P_MMAP_FLAG(n) \ - if (flags & MAP_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - flags &= ~MAP_##n; \ - } - - P_MMAP_FLAG(SHARED); - P_MMAP_FLAG(PRIVATE); -#ifdef MAP_32BIT - P_MMAP_FLAG(32BIT); -#endif - P_MMAP_FLAG(ANONYMOUS); - P_MMAP_FLAG(DENYWRITE); - P_MMAP_FLAG(EXECUTABLE); - P_MMAP_FLAG(FILE); - P_MMAP_FLAG(FIXED); - P_MMAP_FLAG(GROWSDOWN); -#ifdef MAP_HUGETLB - P_MMAP_FLAG(HUGETLB); -#endif - P_MMAP_FLAG(LOCKED); - P_MMAP_FLAG(NONBLOCK); - P_MMAP_FLAG(NORESERVE); - P_MMAP_FLAG(POPULATE); - P_MMAP_FLAG(STACK); -#ifdef MAP_UNINITIALIZED - P_MMAP_FLAG(UNINITIALIZED); -#endif -#undef P_MMAP_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); - - return printed; -} - -#define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags - -static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, flags = arg->val; - -#define P_MREMAP_FLAG(n) \ - if (flags & MREMAP_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - flags &= ~MREMAP_##n; \ - } - - P_MREMAP_FLAG(MAYMOVE); -#ifdef MREMAP_FIXED - P_MREMAP_FLAG(FIXED); -#endif -#undef P_MREMAP_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? 
"|" : "", flags); - - return printed; -} - -#define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags - -static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size, - struct syscall_arg *arg) -{ - int behavior = arg->val; - - switch (behavior) { -#define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n) - P_MADV_BHV(NORMAL); - P_MADV_BHV(RANDOM); - P_MADV_BHV(SEQUENTIAL); - P_MADV_BHV(WILLNEED); - P_MADV_BHV(DONTNEED); - P_MADV_BHV(REMOVE); - P_MADV_BHV(DONTFORK); - P_MADV_BHV(DOFORK); - P_MADV_BHV(HWPOISON); -#ifdef MADV_SOFT_OFFLINE - P_MADV_BHV(SOFT_OFFLINE); -#endif - P_MADV_BHV(MERGEABLE); - P_MADV_BHV(UNMERGEABLE); -#ifdef MADV_HUGEPAGE - P_MADV_BHV(HUGEPAGE); -#endif -#ifdef MADV_NOHUGEPAGE - P_MADV_BHV(NOHUGEPAGE); -#endif -#ifdef MADV_DONTDUMP - P_MADV_BHV(DONTDUMP); -#endif -#ifdef MADV_DODUMP - P_MADV_BHV(DODUMP); -#endif -#undef P_MADV_PHV - default: break; - } - - return scnprintf(bf, size, "%#x", behavior); -} - -#define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior - static size_t syscall_arg__scnprintf_flock(char *bf, size_t size, struct syscall_arg *arg) { @@ -1145,6 +987,7 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, .arg_parm = { [arg] = &strarray__##array, } #include "trace/beauty/pid.c" +#include "trace/beauty/mmap.c" #include "trace/beauty/mode_t.c" #include "trace/beauty/sched_policy.c" #include "trace/beauty/waitid_options.c" diff --git a/tools/perf/trace/beauty/mmap.c b/tools/perf/trace/beauty/mmap.c new file mode 100644 index 0000000000000..3444a4d5382d1 --- /dev/null +++ b/tools/perf/trace/beauty/mmap.c @@ -0,0 +1,158 @@ +#include + +static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, prot = arg->val; + + if (prot == PROT_NONE) + return scnprintf(bf, size, "NONE"); +#define P_MMAP_PROT(n) \ + if (prot & PROT_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + prot &= ~PROT_##n; \ + } + + P_MMAP_PROT(EXEC); + P_MMAP_PROT(READ); + P_MMAP_PROT(WRITE); +#ifdef PROT_SEM + P_MMAP_PROT(SEM); +#endif + P_MMAP_PROT(GROWSDOWN); + P_MMAP_PROT(GROWSUP); +#undef P_MMAP_PROT + + if (prot) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot); + + return printed; +} + +#define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot + +#ifndef MAP_STACK +# define MAP_STACK 0x20000 +#endif + +static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + +#define P_MMAP_FLAG(n) \ + if (flags & MAP_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~MAP_##n; \ + } + + P_MMAP_FLAG(SHARED); + P_MMAP_FLAG(PRIVATE); +#ifdef MAP_32BIT + P_MMAP_FLAG(32BIT); +#endif + P_MMAP_FLAG(ANONYMOUS); + P_MMAP_FLAG(DENYWRITE); + P_MMAP_FLAG(EXECUTABLE); + P_MMAP_FLAG(FILE); + P_MMAP_FLAG(FIXED); + P_MMAP_FLAG(GROWSDOWN); +#ifdef MAP_HUGETLB + P_MMAP_FLAG(HUGETLB); +#endif + P_MMAP_FLAG(LOCKED); + P_MMAP_FLAG(NONBLOCK); + P_MMAP_FLAG(NORESERVE); + P_MMAP_FLAG(POPULATE); + P_MMAP_FLAG(STACK); +#ifdef MAP_UNINITIALIZED + P_MMAP_FLAG(UNINITIALIZED); +#endif +#undef P_MMAP_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? 
"|" : "", flags); + + return printed; +} + +#define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags + +static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + +#define P_MREMAP_FLAG(n) \ + if (flags & MREMAP_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~MREMAP_##n; \ + } + + P_MREMAP_FLAG(MAYMOVE); +#ifdef MREMAP_FIXED + P_MREMAP_FLAG(FIXED); +#endif +#undef P_MREMAP_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); + + return printed; +} + +#define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags + +#ifndef MADV_HWPOISON +#define MADV_HWPOISON 100 +#endif + +#ifndef MADV_MERGEABLE +#define MADV_MERGEABLE 12 +#endif + +#ifndef MADV_UNMERGEABLE +#define MADV_UNMERGEABLE 13 +#endif + +static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size, + struct syscall_arg *arg) +{ + int behavior = arg->val; + + switch (behavior) { +#define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n) + P_MADV_BHV(NORMAL); + P_MADV_BHV(RANDOM); + P_MADV_BHV(SEQUENTIAL); + P_MADV_BHV(WILLNEED); + P_MADV_BHV(DONTNEED); + P_MADV_BHV(REMOVE); + P_MADV_BHV(DONTFORK); + P_MADV_BHV(DOFORK); + P_MADV_BHV(HWPOISON); +#ifdef MADV_SOFT_OFFLINE + P_MADV_BHV(SOFT_OFFLINE); +#endif + P_MADV_BHV(MERGEABLE); + P_MADV_BHV(UNMERGEABLE); +#ifdef MADV_HUGEPAGE + P_MADV_BHV(HUGEPAGE); +#endif +#ifdef MADV_NOHUGEPAGE + P_MADV_BHV(NOHUGEPAGE); +#endif +#ifdef MADV_DONTDUMP + P_MADV_BHV(DONTDUMP); +#endif +#ifdef MADV_DODUMP + P_MADV_BHV(DODUMP); +#endif +#undef P_MADV_PHV + default: break; + } + + return scnprintf(bf, size, "%#x", behavior); +} + +#define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior -- GitLab From ea8dc3cefba0a0decaedc710b218a6ceffe0194a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 13 Apr 2016 12:10:19 -0300 Subject: [PATCH 245/705] perf trace: Move eventfd beautifiers to trace/beauty/ directory To better organize all these beautifiers. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-zrw5zz7cnrs44o5osouyutvt@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 41 +------------------------------ tools/perf/trace/beauty/eventfd.c | 38 ++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 40 deletions(-) create mode 100644 tools/perf/trace/beauty/eventfd.c diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index abd5a94f5dbeb..8e090a785c5e5 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -47,20 +47,6 @@ #include #include -/* For older distros: */ - -#ifndef EFD_SEMAPHORE -# define EFD_SEMAPHORE 1 -#endif - -#ifndef EFD_NONBLOCK -# define EFD_NONBLOCK 00004000 -#endif - -#ifndef EFD_CLOEXEC -# define EFD_CLOEXEC 02000000 -#endif - #ifndef O_CLOEXEC # define O_CLOEXEC 02000000 #endif @@ -772,32 +758,6 @@ static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size, #define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags -static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, flags = arg->val; - - if (flags == 0) - return scnprintf(bf, size, "NONE"); -#define P_FLAG(n) \ - if (flags & EFD_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? 
"|" : "", #n); \ - flags &= ~EFD_##n; \ - } - - P_FLAG(SEMAPHORE); - P_FLAG(CLOEXEC); - P_FLAG(NONBLOCK); -#undef P_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); - - return printed; -} - -#define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags - static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size, struct syscall_arg *arg) { @@ -986,6 +946,7 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \ .arg_parm = { [arg] = &strarray__##array, } +#include "trace/beauty/eventfd.c" #include "trace/beauty/pid.c" #include "trace/beauty/mmap.c" #include "trace/beauty/mode_t.c" diff --git a/tools/perf/trace/beauty/eventfd.c b/tools/perf/trace/beauty/eventfd.c new file mode 100644 index 0000000000000..d64f4a9128a1c --- /dev/null +++ b/tools/perf/trace/beauty/eventfd.c @@ -0,0 +1,38 @@ +#include + +#ifndef EFD_SEMAPHORE +#define EFD_SEMAPHORE 1 +#endif + +#ifndef EFD_NONBLOCK +#define EFD_NONBLOCK 00004000 +#endif + +#ifndef EFD_CLOEXEC +#define EFD_CLOEXEC 02000000 +#endif + +static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size, struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + + if (flags == 0) + return scnprintf(bf, size, "NONE"); +#define P_FLAG(n) \ + if (flags & EFD_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~EFD_##n; \ + } + + P_FLAG(SEMAPHORE); + P_FLAG(CLOEXEC); + P_FLAG(NONBLOCK); +#undef P_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); + + return printed; +} + +#define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags -- GitLab From 4532f642974d871f9a50e9a09bc482eaed5394f6 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 13 Apr 2016 08:21:04 +0000 Subject: [PATCH 246/705] perf ordered_events: Introduce reinit() 'perf record' will use this when outputting multiple perf.data files. 
Signed-off-by: Wang Nan Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1460535673-159866-2-git-send-email-wangnan0@huawei.com Signed-off-by: He Kuang [ Split from larger patch ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/ordered-events.c | 9 +++++++++ tools/perf/util/ordered-events.h | 1 + 2 files changed, 10 insertions(+) diff --git a/tools/perf/util/ordered-events.c b/tools/perf/util/ordered-events.c index b1b9e2385f4b3..fe84df1875aa9 100644 --- a/tools/perf/util/ordered-events.c +++ b/tools/perf/util/ordered-events.c @@ -308,3 +308,12 @@ void ordered_events__free(struct ordered_events *oe) free(event); } } + +void ordered_events__reinit(struct ordered_events *oe) +{ + ordered_events__deliver_t old_deliver = oe->deliver; + + ordered_events__free(oe); + memset(oe, '\0', sizeof(*oe)); + ordered_events__init(oe, old_deliver); +} diff --git a/tools/perf/util/ordered-events.h b/tools/perf/util/ordered-events.h index f403991e3bfd8..e11468a9a6e40 100644 --- a/tools/perf/util/ordered-events.h +++ b/tools/perf/util/ordered-events.h @@ -49,6 +49,7 @@ void ordered_events__delete(struct ordered_events *oe, struct ordered_event *eve int ordered_events__flush(struct ordered_events *oe, enum oe_flush how); void ordered_events__init(struct ordered_events *oe, ordered_events__deliver_t deliver); void ordered_events__free(struct ordered_events *oe); +void ordered_events__reinit(struct ordered_events *oe); static inline void ordered_events__set_alloc_size(struct ordered_events *oe, u64 size) -- GitLab From b26dc73018d2e3a68cad0cf0bad902a8637f9bdf Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 13 Apr 2016 08:21:04 +0000 Subject: [PATCH 247/705] perf session: Make ordered_events reusable ordered_events__free() leaves the linked lists and timestamps uncleared, so the structure cannot be reused after ordered_events__free(). That is inconvenient once 'perf record' supports generating multiple perf.data outputs and processing build-ids for each of them. Use ordered_events__reinit() for this. Signed-off-by: Wang Nan Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1460535673-159866-2-git-send-email-wangnan0@huawei.com Signed-off-by: He Kuang [ Split from larger patch ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 91d4528d71fa8..ca1827c4af4a4 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1836,7 +1836,11 @@ static int __perf_session__process_events(struct perf_session *session, out_err: ui_progress__finish(); perf_session__warn_about_errors(session); - ordered_events__free(&session->ordered_events); + /* + * We may be switching perf.data output, make ordered_events + * reusable. + */ + ordered_events__reinit(&session->ordered_events); auxtrace__free_events(session); session->one_mmap = false; return err; -- GitLab From 040f9915e99e604688c2880e0b1e5b59dbfefa54 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 13 Apr 2016 08:21:05 +0000 Subject: [PATCH 248/705] perf data: Add perf_data_file__switch() helper perf_data_file__switch() closes the current output file, renames it, then opens a new one to continue recording. It will be used by 'perf record' to split output into multiple perf.data files. 
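Reduced to plain POSIX calls, the switch described above is a rename-then-reopen dance, finished by an lseek() past the header region of the fresh file so that sample data keeps landing after the space reserved for the header. A sketch under those assumptions; switch_output() is a simplified, hypothetical stand-in for the perf_data_file__switch() added below, with the at_exit case and most error handling omitted:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int switch_output(const char *path, int fd, const char *postfix,
			 off_t header_size)
{
	char newpath[4096];

	snprintf(newpath, sizeof(newpath), "%s.%s", path, postfix);
	if (rename(path, newpath))	/* warn only; keep writing the old file */
		perror("rename");

	close(fd);
	fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0644);
	if (fd < 0)
		return -1;
	/* leave room for the header, as the lseek(file->fd, pos, ...) below does */
	if (lseek(fd, header_size, SEEK_SET) == (off_t)-1) {
		close(fd);
		return -1;
	}
	return fd;
}

int main(void)
{
	int fd = open("perf.data", O_RDWR | O_CREAT, 0644);

	fd = switch_output("perf.data", fd, "2015122520103046", 4096);
	return fd < 0;
}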
Signed-off-by: Wang Nan Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1460535673-159866-3-git-send-email-wangnan0@huawei.com Signed-off-by: He Kuang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data.c | 41 +++++++++++++++++++++++++++++++++++++++++ tools/perf/util/data.h | 11 ++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index 1921942fc2e03..be83516155ee5 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -136,3 +136,44 @@ ssize_t perf_data_file__write(struct perf_data_file *file, { return writen(file->fd, buf, size); } + +int perf_data_file__switch(struct perf_data_file *file, + const char *postfix, + size_t pos, bool at_exit) +{ + char *new_filepath; + int ret; + + if (check_pipe(file)) + return -EINVAL; + if (perf_data_file__is_read(file)) + return -EINVAL; + + if (asprintf(&new_filepath, "%s.%s", file->path, postfix) < 0) + return -ENOMEM; + + /* + * Only fire a warning, don't return an error, continue filling + * the original file. + */ + if (rename(file->path, new_filepath)) + pr_warning("Failed to rename %s to %s\n", file->path, new_filepath); + + if (!at_exit) { + close(file->fd); + ret = perf_data_file__open(file); + if (ret < 0) + goto out; + + if (lseek(file->fd, pos, SEEK_SET) == (off_t)-1) { + ret = -errno; + pr_debug("Failed to lseek to %zu: %s", + pos, strerror(errno)); + goto out; + } + } + ret = file->fd; +out: + free(new_filepath); + return ret; +} diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h index 2b15d0c95c7f3..ae510ce16cb12 100644 --- a/tools/perf/util/data.h +++ b/tools/perf/util/data.h @@ -46,5 +46,14 @@ int perf_data_file__open(struct perf_data_file *file); void perf_data_file__close(struct perf_data_file *file); ssize_t perf_data_file__write(struct perf_data_file *file, void *buf, size_t size); - +/* + * If at_exit is set, only rename the current perf.data to + * perf.data.<postfix>; continue writing to the original file. + * Set at_exit when flushing the last output. + * + * The return value is the fd of the new output. + */ +int perf_data_file__switch(struct perf_data_file *file, + const char *postfix, + size_t pos, bool at_exit); #endif /* __PERF_DATA_H */ -- GitLab From c0bdc1c461dd5b2e492c0746708a3e0da6b13515 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 13 Apr 2016 08:21:06 +0000 Subject: [PATCH 249/705] perf record: Turns auxtrace_snapshot_enable into 3 states auxtrace_snapshot_enable has only two states (0/1). Turn it into a three-state enum so the SIGUSR2 handler can safely do other work without triggering an auxtrace snapshot. 
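The reason two states are not enough fits in a few lines: with a plain on/off flag, a SIGUSR2 arriving when snapshotting was never configured cannot be told apart from one arriving while snapshotting is merely paused around a read. A stripped-down sketch of the same guard, using sig_atomic_t in place of the volatile enum introduced below:

#include <signal.h>
#include <stdio.h>

enum { SNAP_OFF = -1, SNAP_DISABLED = 0, SNAP_ENABLED = 1 };

static volatile sig_atomic_t snap_state = SNAP_OFF;

static void sigusr2_handler(int sig)
{
	(void)sig;
	if (snap_state != SNAP_ENABLED)
		return;			/* never configured, or paused */
	snap_state = SNAP_DISABLED;	/* pause; a snapshot would start here */
}

int main(void)
{
	signal(SIGUSR2, sigusr2_handler);
	snap_state = SNAP_ENABLED;	/* only if snapshot mode was requested */
	raise(SIGUSR2);
	printf("state after SIGUSR2: %d\n", (int)snap_state);
	return 0;
}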
Signed-off-by: Wang Nan Acked-by: Adrian Hunter Acked-by: Jiri Olsa Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1460535673-159866-4-git-send-email-wangnan0@huawei.com Signed-off-by: He Kuang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 59 ++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index eb6a199a833c2..480033fb9b204 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -125,7 +125,43 @@ static int record__mmap_read(struct record *rec, int idx) static volatile int done; static volatile int signr = -1; static volatile int child_finished; -static volatile int auxtrace_snapshot_enabled; + +static volatile enum { + AUXTRACE_SNAPSHOT_OFF = -1, + AUXTRACE_SNAPSHOT_DISABLED = 0, + AUXTRACE_SNAPSHOT_ENABLED = 1, +} auxtrace_snapshot_state = AUXTRACE_SNAPSHOT_OFF; + +static inline void +auxtrace_snapshot_on(void) +{ + auxtrace_snapshot_state = AUXTRACE_SNAPSHOT_DISABLED; +} + +static inline void +auxtrace_snapshot_enable(void) +{ + if (auxtrace_snapshot_state == AUXTRACE_SNAPSHOT_OFF) + return; + auxtrace_snapshot_state = AUXTRACE_SNAPSHOT_ENABLED; +} + +static inline void +auxtrace_snapshot_disable(void) +{ + if (auxtrace_snapshot_state == AUXTRACE_SNAPSHOT_OFF) + return; + auxtrace_snapshot_state = AUXTRACE_SNAPSHOT_DISABLED; +} + +static inline bool +auxtrace_snapshot_is_enabled(void) +{ + if (auxtrace_snapshot_state == AUXTRACE_SNAPSHOT_OFF) + return false; + return auxtrace_snapshot_state == AUXTRACE_SNAPSHOT_ENABLED; +} + static volatile int auxtrace_snapshot_err; static volatile int auxtrace_record__snapshot_started; @@ -249,7 +285,7 @@ static void record__read_auxtrace_snapshot(struct record *rec) } else { auxtrace_snapshot_err = auxtrace_record__snapshot_finish(rec->itr); if (!auxtrace_snapshot_err) - auxtrace_snapshot_enabled = 1; + auxtrace_snapshot_enable(); } } @@ -615,10 +651,13 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) signal(SIGCHLD, sig_handler); signal(SIGINT, sig_handler); signal(SIGTERM, sig_handler); - if (rec->opts.auxtrace_snapshot_mode) + + if (rec->opts.auxtrace_snapshot_mode) { signal(SIGUSR2, snapshot_sig_handler); - else + auxtrace_snapshot_on(); + } else { signal(SIGUSR2, SIG_IGN); + } session = perf_session__new(file, false, tool); if (session == NULL) { @@ -744,12 +783,12 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) perf_evlist__enable(rec->evlist); } - auxtrace_snapshot_enabled = 1; + auxtrace_snapshot_enable(); for (;;) { unsigned long long hits = rec->samples; if (record__mmap_read_all(rec) < 0) { - auxtrace_snapshot_enabled = 0; + auxtrace_snapshot_disable(); err = -1; goto out_child; } @@ -787,12 +826,12 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) * disable events in this case. 
*/ if (done && !disabled && !target__none(&opts->target)) { - auxtrace_snapshot_enabled = 0; + auxtrace_snapshot_disable(); perf_evlist__disable(rec->evlist); disabled = true; } } - auxtrace_snapshot_enabled = 0; + auxtrace_snapshot_disable(); if (forks && workload_exec_errno) { char msg[STRERR_BUFSIZE]; @@ -1358,9 +1397,9 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused) static void snapshot_sig_handler(int sig __maybe_unused) { - if (!auxtrace_snapshot_enabled) + if (!auxtrace_snapshot_is_enabled()) return; - auxtrace_snapshot_enabled = 0; + auxtrace_snapshot_disable(); auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr); auxtrace_record__snapshot_started = 1; } -- GitLab From ecfd7a9c044e98fc3da8937e652080bc5abe7918 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 13 Apr 2016 08:21:07 +0000 Subject: [PATCH 250/705] perf record: Add '--timestamp-filename' option to append timestamp to output file name This option appends current timestamp to the output file name. For example: # perf record -a --timestamp-filename ^C[ perf record: Woken up 1 times to write data ] [ perf record: Dump perf.data.2015122622265847 ] [ perf record: Captured and wrote 0.742 MB perf.data. (90 samples) ] # ls perf.data.201512262226584 The timestamp will be useful for identifying each perf.data after the 'perf record' support for generating multiple output files gets introduced. Signed-off-by: Wang Nan Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1460535673-159866-5-git-send-email-wangnan0@huawei.com Signed-off-by: He Kuang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 53 ++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 480033fb9b204..3239a6ec9d230 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -56,6 +56,7 @@ struct record { bool no_buildid_cache; bool no_buildid_cache_set; bool buildid_all; + bool timestamp_filename; unsigned long long samples; }; @@ -531,6 +532,37 @@ record__finish_output(struct record *rec) return; } +static int +record__switch_output(struct record *rec, bool at_exit) +{ + struct perf_data_file *file = &rec->file; + int fd, err; + + /* Same Size: "2015122520103046"*/ + char timestamp[] = "InvalidTimestamp"; + + rec->samples = 0; + record__finish_output(rec); + err = fetch_current_timestamp(timestamp, sizeof(timestamp)); + if (err) { + pr_err("Failed to get current timestamp\n"); + return -EINVAL; + } + + fd = perf_data_file__switch(file, timestamp, + rec->session->header.data_offset, + at_exit); + if (fd >= 0 && !at_exit) { + rec->bytes_written = 0; + rec->session->header.data_size = 0; + } + + if (!quiet) + fprintf(stderr, "[ perf record: Dump %s.%s ]\n", + file->path, timestamp); + return fd; +} + static volatile int workload_exec_errno; /* @@ -865,11 +897,22 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) /* this will be recalculated during process_buildids() */ rec->samples = 0; - if (!err) - record__finish_output(rec); + if (!err) { + if (!rec->timestamp_filename) { + record__finish_output(rec); + } else { + fd = record__switch_output(rec, true); + if (fd < 0) { + status = fd; + goto out_delete_session; + } + } + } if (!err && !quiet) { char samples[128]; + const char *postfix = rec->timestamp_filename ? + "." 
: ""; if (rec->samples && !rec->opts.full_auxtrace) scnprintf(samples, sizeof(samples), @@ -877,9 +920,9 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) else samples[0] = '\0'; - fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s ]\n", + fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s ]\n", perf_data_file__size(file) / 1024.0 / 1024.0, - file->path, samples); + file->path, postfix, samples); } out_delete_session: @@ -1249,6 +1292,8 @@ struct option __record_options[] = { "file", "vmlinux pathname"), OPT_BOOLEAN(0, "buildid-all", &record.buildid_all, "Record build-id of all DSOs regardless of hits"), + OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename, + "append timestamp to output filename"), OPT_END() }; -- GitLab From 20105ca1240c3d3ac8cc79bf195022e5e5c1c3fb Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Thu, 14 Apr 2016 16:53:18 +0900 Subject: [PATCH 251/705] perf config: Introduce perf_config_set class This infrastructure code was designed for upcoming features of 'perf config'. That collect config key-value pairs from user and system config files (i.e. user wide ~/.perfconfig and system wide $(sysconfdir)/perfconfig) to manage perf's configs. Reviewed-by: Masami Hiramatsu Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1460620401-23430-2-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/config.c | 173 +++++++++++++++++++++++++++++++++++++++ tools/perf/util/config.h | 26 ++++++ 2 files changed, 199 insertions(+) create mode 100644 tools/perf/util/config.h diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index 664490b8b327c..dad7d82721681 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -13,6 +13,7 @@ #include #include "util/hist.h" /* perf_hist_config */ #include "util/llvm-utils.h" /* perf_llvm_config */ +#include "config.h" #define MAXNAME (256) @@ -524,6 +525,178 @@ int perf_config(config_fn_t fn, void *data) return ret; } +static struct perf_config_section *find_section(struct list_head *sections, + const char *section_name) +{ + struct perf_config_section *section; + + list_for_each_entry(section, sections, node) + if (!strcmp(section->name, section_name)) + return section; + + return NULL; +} + +static struct perf_config_item *find_config_item(const char *name, + struct perf_config_section *section) +{ + struct perf_config_item *item; + + list_for_each_entry(item, §ion->items, node) + if (!strcmp(item->name, name)) + return item; + + return NULL; +} + +static struct perf_config_section *add_section(struct list_head *sections, + const char *section_name) +{ + struct perf_config_section *section = zalloc(sizeof(*section)); + + if (!section) + return NULL; + + INIT_LIST_HEAD(§ion->items); + section->name = strdup(section_name); + if (!section->name) { + pr_debug("%s: strdup failed\n", __func__); + free(section); + return NULL; + } + + list_add_tail(§ion->node, sections); + return section; +} + +static struct perf_config_item *add_config_item(struct perf_config_section *section, + const char *name) +{ + struct perf_config_item *item = zalloc(sizeof(*item)); + + if (!item) + return NULL; + + item->name = strdup(name); + if (!item->name) { + pr_debug("%s: strdup failed\n", __func__); + free(item); + return NULL; + } + + list_add_tail(&item->node, §ion->items); + return item; +} + +static int set_value(struct perf_config_item *item, const char *value) +{ + char *val = 
strdup(value); + + if (!val) + return -1; + + zfree(&item->value); + item->value = val; + return 0; +} + +static int collect_config(const char *var, const char *value, + void *perf_config_set) +{ + int ret = -1; + char *ptr, *key; + char *section_name, *name; + struct perf_config_section *section = NULL; + struct perf_config_item *item = NULL; + struct perf_config_set *set = perf_config_set; + struct list_head *sections = &set->sections; + + key = ptr = strdup(var); + if (!key) { + pr_debug("%s: strdup failed\n", __func__); + return -1; + } + + section_name = strsep(&ptr, "."); + name = ptr; + if (name == NULL || value == NULL) + goto out_free; + + section = find_section(sections, section_name); + if (!section) { + section = add_section(sections, section_name); + if (!section) + goto out_free; + } + + item = find_config_item(name, section); + if (!item) { + item = add_config_item(section, name); + if (!item) + goto out_free; + } + + ret = set_value(item, value); + return ret; + +out_free: + free(key); + perf_config_set__delete(set); + return -1; +} + +struct perf_config_set *perf_config_set__new(void) +{ + struct perf_config_set *set = zalloc(sizeof(*set)); + + if (set) { + INIT_LIST_HEAD(&set->sections); + perf_config(collect_config, set); + } + + return set; +} + +static void perf_config_item__delete(struct perf_config_item *item) +{ + zfree(&item->name); + zfree(&item->value); + free(item); +} + +static void perf_config_section__purge(struct perf_config_section *section) +{ + struct perf_config_item *item, *tmp; + + list_for_each_entry_safe(item, tmp, §ion->items, node) { + list_del_init(&item->node); + perf_config_item__delete(item); + } +} + +static void perf_config_section__delete(struct perf_config_section *section) +{ + perf_config_section__purge(section); + zfree(§ion->name); + free(section); +} + +static void perf_config_set__purge(struct perf_config_set *set) +{ + struct perf_config_section *section, *tmp; + + list_for_each_entry_safe(section, tmp, &set->sections, node) { + list_del_init(§ion->node); + perf_config_section__delete(section); + } +} + +void perf_config_set__delete(struct perf_config_set *set) +{ + perf_config_set__purge(set); + free(set); +} + /* * Call this to report error for your variable that should not * get a boolean value (i.e. "[my] var" means "true"). 
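The section/item split in collect_config() above hinges on a single strsep() call: a variable arrives as "section.name" and is cut at the first dot, with everything after that dot (further dots included) kept as the item name. The same split, isolated into a runnable snippet:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char *key = strdup("top.children");	/* as passed to collect_config() */
	char *ptr = key;
	char *section_name = strsep(&ptr, ".");	/* "top"; ptr now points past the dot */
	char *name = ptr;			/* "children", or NULL if no dot */

	if (name == NULL)
		fprintf(stderr, "not in section.name form\n");
	else
		printf("section '%s', item '%s'\n", section_name, name);
	free(key);	/* strsep() only wrote a NUL; key still owns the buffer */
	return 0;
}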
diff --git a/tools/perf/util/config.h b/tools/perf/util/config.h new file mode 100644 index 0000000000000..22ec626ac7188 --- /dev/null +++ b/tools/perf/util/config.h @@ -0,0 +1,26 @@ +#ifndef __PERF_CONFIG_H +#define __PERF_CONFIG_H + +#include +#include + +struct perf_config_item { + char *name; + char *value; + struct list_head node; +}; + +struct perf_config_section { + char *name; + struct list_head items; + struct list_head node; +}; + +struct perf_config_set { + struct list_head sections; +}; + +struct perf_config_set *perf_config_set__new(void); +void perf_config_set__delete(struct perf_config_set *set); + +#endif /* __PERF_CONFIG_H */ -- GitLab From 860b8d4b3f893c97f905b978ecf62f48816dc5de Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Thu, 14 Apr 2016 16:53:19 +0900 Subject: [PATCH 252/705] perf config: Make show_config() use perf_config_set Currently show_config() has a problem when user and system config files have the same config variables i.e.: # cat ~/.perfconfig [top] children = false When $(sysconfdir) is /usr/local/etc # cat /usr/local/etc/perfconfig [top] children = true Before: # perf config --user --list top.children=false # perf config --system --list top.children=true # perf config --list top.children=true top.children=false Because perf_config() can call show_config() each the config file (user and system). Fix it. After: # perf config --user --list top.children=false # perf config --system --list top.children=true # perf config --list top.children=false Signed-off-by: Taeung Song Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1460620401-23430-3-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-config.c | 39 ++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c index c42448ed5dfe2..fe1b77fa21f91 100644 --- a/tools/perf/builtin-config.c +++ b/tools/perf/builtin-config.c @@ -12,6 +12,7 @@ #include #include "util/util.h" #include "util/debug.h" +#include "util/config.h" static bool use_system_config, use_user_config; @@ -32,13 +33,28 @@ static struct option config_options[] = { OPT_END() }; -static int show_config(const char *key, const char *value, - void *cb __maybe_unused) +static int show_config(struct perf_config_set *set) { - if (value) - printf("%s=%s\n", key, value); - else - printf("%s\n", key); + struct perf_config_section *section; + struct perf_config_item *item; + struct list_head *sections; + + if (set == NULL) + return -1; + + sections = &set->sections; + if (list_empty(sections)) + return -1; + + list_for_each_entry(section, sections, node) { + list_for_each_entry(item, §ion->items, node) { + char *value = item->value; + + if (value) + printf("%s.%s=%s\n", section->name, + item->name, value); + } + } return 0; } @@ -46,6 +62,7 @@ static int show_config(const char *key, const char *value, int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused) { int ret = 0; + struct perf_config_set *set; char *user_config = mkpath("%s/.perfconfig", getenv("HOME")); argc = parse_options(argc, argv, config_options, config_usage, @@ -63,13 +80,19 @@ int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused) else if (use_user_config) config_exclusive_filename = user_config; + set = perf_config_set__new(); + if (!set) { + ret = -1; + goto out_err; + } + switch (actions) { case ACTION_LIST: 
if (argc) { pr_err("Error: takes no arguments\n"); parse_options_usage(config_usage, config_options, "l", 1); } else { - ret = perf_config(show_config, NULL); + ret = show_config(set); if (ret < 0) { const char * config_filename = config_exclusive_filename; if (!config_exclusive_filename) @@ -83,5 +106,7 @@ int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused) usage_with_options(config_usage, config_options); } + perf_config_set__delete(set); +out_err: return ret; } -- GitLab From bbf86c43eace367e805199f94ad8b5a45636f805 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 14 Apr 2016 13:53:10 -0300 Subject: [PATCH 253/705] perf trace: Move socket_type beautifier to tools/perf/trace/beauty/ To reduce the size of builtin-trace.c. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-ao91htwxdqwlwxr47gbluou1@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 60 +-------------------------- tools/perf/trace/beauty/socket_type.c | 60 +++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 59 deletions(-) create mode 100644 tools/perf/trace/beauty/socket_type.c diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 8e090a785c5e5..e5f0cc16bb933 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -51,18 +51,6 @@ # define O_CLOEXEC 02000000 #endif -#ifndef SOCK_DCCP -# define SOCK_DCCP 6 -#endif - -#ifndef SOCK_CLOEXEC -# define SOCK_CLOEXEC 02000000 -#endif - -#ifndef SOCK_NONBLOCK -# define SOCK_NONBLOCK 00004000 -#endif - #ifndef MSG_CMSG_CLOEXEC # define MSG_CMSG_CLOEXEC 0x40000000 #endif @@ -538,53 +526,6 @@ static const char *socket_families[] = { }; static DEFINE_STRARRAY(socket_families); -#ifndef SOCK_TYPE_MASK -#define SOCK_TYPE_MASK 0xf -#endif - -static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size, - struct syscall_arg *arg) -{ - size_t printed; - int type = arg->val, - flags = type & ~SOCK_TYPE_MASK; - - type &= SOCK_TYPE_MASK; - /* - * Can't use a strarray, MIPS may override for ABI reasons. 
-	 */
-	switch (type) {
-#define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
-	P_SK_TYPE(STREAM);
-	P_SK_TYPE(DGRAM);
-	P_SK_TYPE(RAW);
-	P_SK_TYPE(RDM);
-	P_SK_TYPE(SEQPACKET);
-	P_SK_TYPE(DCCP);
-	P_SK_TYPE(PACKET);
-#undef P_SK_TYPE
-	default:
-		printed = scnprintf(bf, size, "%#x", type);
-	}
-
-#define	P_SK_FLAG(n) \
-	if (flags & SOCK_##n) { \
-		printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
-		flags &= ~SOCK_##n; \
-	}
-
-	P_SK_FLAG(CLOEXEC);
-	P_SK_FLAG(NONBLOCK);
-#undef P_SK_FLAG
-
-	if (flags)
-		printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
-
-	return printed;
-}
-
-#define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
-
 #ifndef MSG_PROBE
 #define MSG_PROBE	0x10
 #endif
@@ -951,6 +892,7 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
 #include "trace/beauty/mmap.c"
 #include "trace/beauty/mode_t.c"
 #include "trace/beauty/sched_policy.c"
+#include "trace/beauty/socket_type.c"
 #include "trace/beauty/waitid_options.c"
 
 static struct syscall_fmt {
diff --git a/tools/perf/trace/beauty/socket_type.c b/tools/perf/trace/beauty/socket_type.c
new file mode 100644
index 0000000000000..0a5ce818131ca
--- /dev/null
+++ b/tools/perf/trace/beauty/socket_type.c
@@ -0,0 +1,60 @@
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#ifndef SOCK_DCCP
+# define SOCK_DCCP		6
+#endif
+
+#ifndef SOCK_CLOEXEC
+# define SOCK_CLOEXEC		02000000
+#endif
+
+#ifndef SOCK_NONBLOCK
+# define SOCK_NONBLOCK		00004000
+#endif
+
+#ifndef SOCK_TYPE_MASK
+#define SOCK_TYPE_MASK 0xf
+#endif
+
+static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size, struct syscall_arg *arg)
+{
+	size_t printed;
+	int type = arg->val,
+	    flags = type & ~SOCK_TYPE_MASK;
+
+	type &= SOCK_TYPE_MASK;
+	/*
+	 * Can't use a strarray, MIPS may override for ABI reasons.
+	 */
+	switch (type) {
+#define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
+	P_SK_TYPE(STREAM);
+	P_SK_TYPE(DGRAM);
+	P_SK_TYPE(RAW);
+	P_SK_TYPE(RDM);
+	P_SK_TYPE(SEQPACKET);
+	P_SK_TYPE(DCCP);
+	P_SK_TYPE(PACKET);
+#undef P_SK_TYPE
+	default:
+		printed = scnprintf(bf, size, "%#x", type);
+	}
+
+#define	P_SK_FLAG(n) \
+	if (flags & SOCK_##n) { \
+		printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
+		flags &= ~SOCK_##n; \
+	}
+
+	P_SK_FLAG(CLOEXEC);
+	P_SK_FLAG(NONBLOCK);
+#undef P_SK_FLAG
+
+	if (flags)
+		printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
+
+	return printed;
+}
+
+#define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
--
GitLab


From 91d7b2de318ff701451dfc7ede1c029b150ef0e9 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Thu, 14 Apr 2016 14:48:07 -0300
Subject: [PATCH 254/705] perf callchain: Start moving away from global per thread cursors

The recent perf_evsel__fprintf_callchain() move to evsel.c added
several new symbol requirements to the python binding, for instance:

  # perf test -v python
  16: Try 'import perf' in python, checking link problems :
  --- start ---
  test child forked, pid 18030
  Traceback (most recent call last):
    File "<stdin>", line 1, in <module>
  ImportError: /tmp/build/perf/python/perf.so: undefined symbol: callchain_cursor
  test child finished with -1
  ---- end ----
  Try 'import perf' in python, checking link problems: FAILED!
  #

This would require linking against callchain.c to access the global
callchain_cursor variable.
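In sketch form, the direction of this change (a hedged toy model with
hypothetical names, not the actual perf code): the caller owns the cursor
and hands it to the resolver, so no translation unit needs to link against
a global definition.

/* Toy model: caller-owned cursor passed down instead of a global. */
#include <stdio.h>

struct toy_cursor {
	int depth;
	unsigned long ips[8];
};

static void toy_resolve(struct toy_cursor *c, unsigned long ip)
{
	c->depth = 0;			/* reset before each sample */
	while (ip && c->depth < 8) {	/* fake "unwind" of a chain */
		c->ips[c->depth++] = ip;
		ip >>= 8;
	}
}

int main(void)
{
	struct toy_cursor cursor;	/* stack-local, not global */

	toy_resolve(&cursor, 0xdeadbeefUL);
	for (int i = 0; i < cursor.depth; i++)
		printf("%#lx\n", cursor.ips[i]);
	return 0;
}

With that shape, a stripped-down build such as the python binding only
pulls in the resolvers it actually calls.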
Since lots of functions already receive as a parameter a
callchain_cursor struct pointer, make that be the case for some more
functions, so that we can start phasing out usage of yet another global
variable.

Cc: Adrian Hunter
Cc: David Ahern
Cc: Frederic Weisbecker
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Wang Nan
Link: http://lkml.kernel.org/n/tip-djko3097eyg2rn66v2qcqfvn@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/builtin-kmem.c                      |  2 +-
 tools/perf/util/callchain.c                    |  5 ++--
 tools/perf/util/callchain.h                    |  3 ++-
 tools/perf/util/evsel.c                        |  9 ++++---
 tools/perf/util/hist.c                         |  2 +-
 tools/perf/util/machine.c                      | 26 +++++++++++--------
 tools/perf/util/machine.h                      |  4 +++
 .../util/scripting-engines/trace-event-perl.c  |  2 +-
 .../scripting-engines/trace-event-python.c     |  2 +-
 9 files changed, 33 insertions(+), 22 deletions(-)

diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index c9cb3be47cff4..58adfee230de8 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -375,7 +375,7 @@ static u64 find_callsite(struct perf_evsel *evsel, struct perf_sample *sample)
 	}
 
 	al.thread = machine__findnew_thread(machine, sample->pid, sample->tid);
-	sample__resolve_callchain(sample, NULL, evsel, &al, 16);
+	sample__resolve_callchain(sample, &callchain_cursor, NULL, evsel, &al, 16);
 
 	callchain_cursor_commit(&callchain_cursor);
 	while (true) {
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 24b4bd0d77545..2b4ceaf058bb3 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -788,7 +788,8 @@ int callchain_cursor_append(struct callchain_cursor *cursor,
 	return 0;
 }
 
-int sample__resolve_callchain(struct perf_sample *sample, struct symbol **parent,
+int sample__resolve_callchain(struct perf_sample *sample,
+			      struct callchain_cursor *cursor, struct symbol **parent,
 			      struct perf_evsel *evsel, struct addr_location *al,
 			      int max_stack)
 {
@@ -797,7 +798,7 @@ int sample__resolve_callchain(struct perf_sample *sample, struct symbol **parent
 	if (symbol_conf.use_callchain || symbol_conf.cumulate_callchain ||
 	    sort__has_parent) {
-		return thread__resolve_callchain(al->thread, evsel, sample,
+		return thread__resolve_callchain(al->thread, cursor, evsel, sample,
 						 parent, al, max_stack);
 	}
 	return 0;
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index d2a9e694810c1..cae5a7b1f5c8f 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -212,7 +212,8 @@ struct hist_entry;
 int record_parse_callchain_opt(const struct option *opt, const char *arg, int unset);
 int record_callchain_opt(const struct option *opt, const char *arg, int unset);
 
-int sample__resolve_callchain(struct perf_sample *sample, struct symbol **parent,
+int sample__resolve_callchain(struct perf_sample *sample,
+			      struct callchain_cursor *cursor, struct symbol **parent,
 			      struct perf_evsel *evsel, struct addr_location *al,
 			      int max_stack);
 int hist_entry__append_callchain(struct hist_entry *he, struct perf_sample *sample);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 6e86598682be4..38f464a4fa04f 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -2349,6 +2349,7 @@ int perf_evsel__fprintf_callchain(struct perf_evsel *evsel, struct perf_sample *
 				  FILE *fp)
 {
 	int printed = 0;
+	struct callchain_cursor cursor;
 	struct callchain_cursor_node *node;
 	int print_ip = print_opts & EVSEL__PRINT_IP;
 	int print_sym = print_opts & EVSEL__PRINT_SYM;
@@ -2362,14 +2363,14 @@ int perf_evsel__fprintf_callchain(struct perf_evsel
*evsel, struct perf_sample * if (sample->callchain) { struct addr_location node_al; - if (thread__resolve_callchain(al->thread, evsel, + if (thread__resolve_callchain(al->thread, &cursor, evsel, sample, NULL, NULL, stack_depth) != 0) { if (verbose) error("Failed to resolve callchain. Skipping\n"); return printed; } - callchain_cursor_commit(&callchain_cursor); + callchain_cursor_commit(&cursor); if (print_symoffset) node_al = *al; @@ -2377,7 +2378,7 @@ int perf_evsel__fprintf_callchain(struct perf_evsel *evsel, struct perf_sample * while (stack_depth) { u64 addr = 0; - node = callchain_cursor_current(&callchain_cursor); + node = callchain_cursor_current(&cursor); if (!node) break; @@ -2420,7 +2421,7 @@ int perf_evsel__fprintf_callchain(struct perf_evsel *evsel, struct perf_sample * stack_depth--; next: - callchain_cursor_advance(&callchain_cursor); + callchain_cursor_advance(&cursor); } } diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 3d34c57dfbe26..991a351a8a41d 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -953,7 +953,7 @@ int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al, { int err, err2; - err = sample__resolve_callchain(iter->sample, &iter->parent, + err = sample__resolve_callchain(iter->sample, &callchain_cursor, &iter->parent, iter->evsel, al, max_stack_depth); if (err) return err; diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 80b9b6a87990b..0c4dabc699329 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1599,6 +1599,7 @@ struct mem_info *sample__resolve_mem(struct perf_sample *sample, } static int add_callchain_ip(struct thread *thread, + struct callchain_cursor *cursor, struct symbol **parent, struct addr_location *root_al, u8 *cpumode, @@ -1630,7 +1631,7 @@ static int add_callchain_ip(struct thread *thread, * It seems the callchain is corrupted. * Discard all. */ - callchain_cursor_reset(&callchain_cursor); + callchain_cursor_reset(cursor); return 1; } return 0; @@ -1648,13 +1649,13 @@ static int add_callchain_ip(struct thread *thread, /* Treat this symbol as the root, forgetting its callees. */ *root_al = al; - callchain_cursor_reset(&callchain_cursor); + callchain_cursor_reset(cursor); } } if (symbol_conf.hide_unresolved && al.sym == NULL) return 0; - return callchain_cursor_append(&callchain_cursor, al.addr, al.map, al.sym); + return callchain_cursor_append(cursor, al.addr, al.map, al.sym); } struct branch_info *sample__resolve_bstack(struct perf_sample *sample, @@ -1724,6 +1725,7 @@ static int remove_loops(struct branch_entry *l, int nr) * negative error code on other errors. */ static int resolve_lbr_callchain_sample(struct thread *thread, + struct callchain_cursor *cursor, struct perf_sample *sample, struct symbol **parent, struct addr_location *root_al, @@ -1778,7 +1780,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread, ip = lbr_stack->entries[0].to; } - err = add_callchain_ip(thread, parent, root_al, &cpumode, ip); + err = add_callchain_ip(thread, cursor, parent, root_al, &cpumode, ip); if (err) return (err < 0) ? 
err : 0; } @@ -1789,6 +1791,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread, } static int thread__resolve_callchain_sample(struct thread *thread, + struct callchain_cursor *cursor, struct perf_evsel *evsel, struct perf_sample *sample, struct symbol **parent, @@ -1803,10 +1806,10 @@ static int thread__resolve_callchain_sample(struct thread *thread, int skip_idx = -1; int first_call = 0; - callchain_cursor_reset(&callchain_cursor); + callchain_cursor_reset(cursor); if (has_branch_callstack(evsel)) { - err = resolve_lbr_callchain_sample(thread, sample, parent, + err = resolve_lbr_callchain_sample(thread, cursor, sample, parent, root_al, max_stack); if (err) return (err < 0) ? err : 0; @@ -1863,10 +1866,10 @@ static int thread__resolve_callchain_sample(struct thread *thread, nr = remove_loops(be, nr); for (i = 0; i < nr; i++) { - err = add_callchain_ip(thread, parent, root_al, + err = add_callchain_ip(thread, cursor, parent, root_al, NULL, be[i].to); if (!err) - err = add_callchain_ip(thread, parent, root_al, + err = add_callchain_ip(thread, cursor, parent, root_al, NULL, be[i].from); if (err == -EINVAL) break; @@ -1896,7 +1899,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, #endif ip = chain->ips[j]; - err = add_callchain_ip(thread, parent, root_al, &cpumode, ip); + err = add_callchain_ip(thread, cursor, parent, root_al, &cpumode, ip); if (err) return (err < 0) ? err : 0; @@ -1916,13 +1919,14 @@ static int unwind_entry(struct unwind_entry *entry, void *arg) } int thread__resolve_callchain(struct thread *thread, + struct callchain_cursor *cursor, struct perf_evsel *evsel, struct perf_sample *sample, struct symbol **parent, struct addr_location *root_al, int max_stack) { - int ret = thread__resolve_callchain_sample(thread, evsel, + int ret = thread__resolve_callchain_sample(thread, cursor, evsel, sample, parent, root_al, max_stack); if (ret) @@ -1938,7 +1942,7 @@ int thread__resolve_callchain(struct thread *thread, (!sample->user_stack.size)) return 0; - return unwind__get_entries(unwind_entry, &callchain_cursor, + return unwind__get_entries(unwind_entry, cursor, thread, sample, max_stack); } diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index 8499db2811583..382873bdc5635 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -141,7 +141,11 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample, struct addr_location *al); struct mem_info *sample__resolve_mem(struct perf_sample *sample, struct addr_location *al); + +struct callchain_cursor; + int thread__resolve_callchain(struct thread *thread, + struct callchain_cursor *cursor, struct perf_evsel *evsel, struct perf_sample *sample, struct symbol **parent, diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index 35ed00a600fbe..ae1cebc307c5b 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -263,7 +263,7 @@ static SV *perl_process_callchain(struct perf_sample *sample, if (!symbol_conf.use_callchain || !sample->callchain) goto exit; - if (thread__resolve_callchain(al->thread, evsel, + if (thread__resolve_callchain(al->thread, &callchain_cursor, evsel, sample, NULL, NULL, PERF_MAX_STACK_DEPTH) != 0) { pr_err("Failed to resolve callchain. 
Skipping\n"); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index fbd05242b4e59..525eb49e7ba6b 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -323,7 +323,7 @@ static PyObject *python_process_callchain(struct perf_sample *sample, if (!symbol_conf.use_callchain || !sample->callchain) goto exit; - if (thread__resolve_callchain(al->thread, evsel, + if (thread__resolve_callchain(al->thread, &callchain_cursor, evsel, sample, NULL, NULL, scripting_max_stack) != 0) { pr_err("Failed to resolve callchain. Skipping\n"); -- GitLab From de446b40d5ddb2c3f1fe453ac405543663f9ac5d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 14 Apr 2016 14:56:06 -0300 Subject: [PATCH 255/705] perf evsel: Remove symbol_conf usage # perf test -v python 16: Try 'import perf' in python, checking link problems : --- start --- test child forked, pid 672 Traceback (most recent call last): File "", line 1, in ImportError: /tmp/build/perf/python/perf.so: undefined symbol: symbol_conf test child finished with -1 ---- end ---- Try 'import perf' in python, checking link problems: FAILED! # To fix it just pass a parameter to perf_evsel__fprintf_sym telling if callchains should be printed. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-comrsr20bsnr8bg0n6rfwv12@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 2 ++ tools/perf/util/evsel.c | 6 +++--- tools/perf/util/evsel.h | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 838c0bc38105f..717ba02152340 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -580,6 +580,7 @@ static void print_sample_bts(struct perf_sample *sample, } } perf_evsel__fprintf_sym(evsel, sample, al, 0, print_opts, + symbol_conf.use_callchain, scripting_max_stack, stdout); } @@ -790,6 +791,7 @@ static void process_event(struct perf_script *script, perf_evsel__fprintf_sym(evsel, sample, al, 0, output[attr->type].print_ip_opts, + symbol_conf.use_callchain, scripting_max_stack, stdout); } diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 38f464a4fa04f..60bba67e6959e 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2430,8 +2430,8 @@ int perf_evsel__fprintf_callchain(struct perf_evsel *evsel, struct perf_sample * int perf_evsel__fprintf_sym(struct perf_evsel *evsel, struct perf_sample *sample, struct addr_location *al, int left_alignment, - unsigned int print_opts, unsigned int stack_depth, - FILE *fp) + unsigned int print_opts, bool print_callchain, + unsigned int stack_depth, FILE *fp) { int printed = 0; int print_ip = print_opts & EVSEL__PRINT_IP; @@ -2441,7 +2441,7 @@ int perf_evsel__fprintf_sym(struct perf_evsel *evsel, struct perf_sample *sample int print_srcline = print_opts & EVSEL__PRINT_SRCLINE; int print_unknown_as_addr = print_opts & EVSEL__PRINT_UNKNOWN_AS_ADDR; - if (symbol_conf.use_callchain && sample->callchain) { + if (print_callchain && sample->callchain) { printed += perf_evsel__fprintf_callchain(evsel, sample, al, left_alignment, print_opts, stack_depth, fp); } else if (!(al->sym && al->sym->ignore)) { diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 36edd3c91d5c0..013f3615730bd 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ 
-403,8 +403,8 @@ int perf_evsel__fprintf_callchain(struct perf_evsel *evsel, int perf_evsel__fprintf_sym(struct perf_evsel *evsel, struct perf_sample *sample, struct addr_location *al, int left_alignment, - unsigned int print_opts, unsigned int stack_depth, - FILE *fp); + unsigned int print_opts, bool print_callchain, + unsigned int stack_depth, FILE *fp); bool perf_evsel__fallback(struct perf_evsel *evsel, int err, char *msg, size_t msgsize); -- GitLab From bfbba189b681c86b9ae380358e5f50ce1e33d240 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 14 Apr 2016 15:54:36 -0300 Subject: [PATCH 256/705] perf symbols: Move fprintf routines to separate object file To disentangle symbol printing from all the code related to symbol tables, resolution of addresses to symbols, etc. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-eik9g3hbtdc7ddv57f1d4v3p@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 + tools/perf/util/python-ext-sources | 1 + tools/perf/util/symbol.c | 71 ------------------------------ tools/perf/util/symbol.h | 5 +++ tools/perf/util/symbol_fprintf.c | 71 ++++++++++++++++++++++++++++++ 5 files changed, 78 insertions(+), 71 deletions(-) create mode 100644 tools/perf/util/symbol_fprintf.c diff --git a/tools/perf/util/Build b/tools/perf/util/Build index ea4ac03c1ec81..61021334e9581 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -29,6 +29,7 @@ libperf-y += usage.o libperf-y += wrapper.o libperf-y += dso.o libperf-y += symbol.o +libperf-y += symbol_fprintf.o libperf-y += color.o libperf-y += header.o libperf-y += callchain.o diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources index 8162ba0e2e57e..36c6862119e32 100644 --- a/tools/perf/util/python-ext-sources +++ b/tools/perf/util/python-ext-sources @@ -23,3 +23,4 @@ util/strlist.c util/trace-event.c ../lib/rbtree.c util/string.c +util/symbol_fprintf.c diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index bb162ee433c62..a36823c3b7c0d 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -255,57 +255,6 @@ void symbol__delete(struct symbol *sym) free(((void *)sym) - symbol_conf.priv_size); } -size_t symbol__fprintf(struct symbol *sym, FILE *fp) -{ - return fprintf(fp, " %" PRIx64 "-%" PRIx64 " %c %s\n", - sym->start, sym->end, - sym->binding == STB_GLOBAL ? 'g' : - sym->binding == STB_LOCAL ? 
'l' : 'w',
-		       sym->name);
-}
-
-size_t __symbol__fprintf_symname_offs(const struct symbol *sym,
-				      const struct addr_location *al,
-				      bool unknown_as_addr, FILE *fp)
-{
-	unsigned long offset;
-	size_t length;
-
-	if (sym && sym->name) {
-		length = fprintf(fp, "%s", sym->name);
-		if (al) {
-			if (al->addr < sym->end)
-				offset = al->addr - sym->start;
-			else
-				offset = al->addr - al->map->start - sym->start;
-			length += fprintf(fp, "+0x%lx", offset);
-		}
-		return length;
-	} else if (al && unknown_as_addr)
-		return fprintf(fp, "[%#" PRIx64 "]", al->addr);
-	else
-		return fprintf(fp, "[unknown]");
-}
-
-size_t symbol__fprintf_symname_offs(const struct symbol *sym,
-				    const struct addr_location *al,
-				    FILE *fp)
-{
-	return __symbol__fprintf_symname_offs(sym, al, false, fp);
-}
-
-size_t __symbol__fprintf_symname(const struct symbol *sym,
-				 const struct addr_location *al,
-				 bool unknown_as_addr, FILE *fp)
-{
-	return __symbol__fprintf_symname_offs(sym, al, unknown_as_addr, fp);
-}
-
-size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp)
-{
-	return __symbol__fprintf_symname_offs(sym, NULL, false, fp);
-}
-
 void symbols__delete(struct rb_root *symbols)
 {
 	struct symbol *pos;
@@ -381,11 +330,6 @@ static struct symbol *symbols__next(struct symbol *sym)
 	return NULL;
 }
 
-struct symbol_name_rb_node {
-	struct rb_node rb_node;
-	struct symbol sym;
-};
-
 static void symbols__insert_by_name(struct rb_root *symbols, struct symbol *sym)
 {
 	struct rb_node **p = &symbols->rb_node;
@@ -514,21 +458,6 @@ void dso__sort_by_name(struct dso *dso, enum map_type type)
 			    &dso->symbols[type]);
 }
 
-size_t dso__fprintf_symbols_by_name(struct dso *dso,
-				    enum map_type type, FILE *fp)
-{
-	size_t ret = 0;
-	struct rb_node *nd;
-	struct symbol_name_rb_node *pos;
-
-	for (nd = rb_first(&dso->symbol_names[type]); nd; nd = rb_next(nd)) {
-		pos = rb_entry(nd, struct symbol_name_rb_node, rb_node);
-		fprintf(fp, "%s\n", pos->sym.name);
-	}
-
-	return ret;
-}
-
 int modules__parse(const char *filename, void *arg,
 		   int (*process_module)(void *arg, const char *name,
 					 u64 start))
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index e2562568418d8..1da7b101bc7f8 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -140,6 +140,11 @@ struct symbol_conf {
 
 extern struct symbol_conf symbol_conf;
 
+struct symbol_name_rb_node {
+	struct rb_node rb_node;
+	struct symbol sym;
+};
+
 static inline int __symbol__join_symfs(char *bf, size_t size, const char *path)
 {
 	return path__join(bf, size, symbol_conf.symfs, path);
diff --git a/tools/perf/util/symbol_fprintf.c b/tools/perf/util/symbol_fprintf.c
new file mode 100644
index 0000000000000..a680bdaa65dc3
--- /dev/null
+++ b/tools/perf/util/symbol_fprintf.c
@@ -0,0 +1,71 @@
+#include <elf.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#include "symbol.h"
+
+size_t symbol__fprintf(struct symbol *sym, FILE *fp)
+{
+	return fprintf(fp, " %" PRIx64 "-%" PRIx64 " %c %s\n",
+		       sym->start, sym->end,
+		       sym->binding == STB_GLOBAL ? 'g' :
+		       sym->binding == STB_LOCAL ? 'l' : 'w',
+		       sym->name);
+}
+
+size_t __symbol__fprintf_symname_offs(const struct symbol *sym,
+				      const struct addr_location *al,
+				      bool unknown_as_addr, FILE *fp)
+{
+	unsigned long offset;
+	size_t length;
+
+	if (sym && sym->name) {
+		length = fprintf(fp, "%s", sym->name);
+		if (al) {
+			if (al->addr < sym->end)
+				offset = al->addr - sym->start;
+			else
+				offset = al->addr - al->map->start - sym->start;
+			length += fprintf(fp, "+0x%lx", offset);
+		}
+		return length;
+	} else if (al && unknown_as_addr)
+		return fprintf(fp, "[%#" PRIx64 "]", al->addr);
+	else
+		return fprintf(fp, "[unknown]");
+}
+
+size_t symbol__fprintf_symname_offs(const struct symbol *sym,
+				    const struct addr_location *al,
+				    FILE *fp)
+{
+	return __symbol__fprintf_symname_offs(sym, al, false, fp);
+}
+
+size_t __symbol__fprintf_symname(const struct symbol *sym,
+				 const struct addr_location *al,
+				 bool unknown_as_addr, FILE *fp)
+{
+	return __symbol__fprintf_symname_offs(sym, al, unknown_as_addr, fp);
+}
+
+size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp)
+{
+	return __symbol__fprintf_symname_offs(sym, NULL, false, fp);
+}
+
+size_t dso__fprintf_symbols_by_name(struct dso *dso,
+				    enum map_type type, FILE *fp)
+{
+	size_t ret = 0;
+	struct rb_node *nd;
+	struct symbol_name_rb_node *pos;
+
+	for (nd = rb_first(&dso->symbol_names[type]); nd; nd = rb_next(nd)) {
+		pos = rb_entry(nd, struct symbol_name_rb_node, rb_node);
+		fprintf(fp, "%s\n", pos->sym.name);
+	}
+
+	return ret;
+}
--
GitLab


From 6f736735e30f51805f6be31d20a4bf5b0ae91bae Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Thu, 14 Apr 2016 17:45:51 -0300
Subject: [PATCH 257/705] perf evsel: Require that callchains be resolved before calling fprintf_{sym,callchain}

This way the print routine merely does printing, not requiring access
to the resolving machinery, which helps disentangle the object files
and makes it easier to create subsets with a limited functionality set.
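The contract after this change, as a hedged toy sketch (hypothetical
names, not perf's API): the caller resolves the chain into a cursor
first, and the printer only walks whatever cursor it is handed, with a
NULL cursor meaning there is no callchain to print.

/* Toy sketch: printing walks an already-resolved cursor; it never
 * calls back into the resolver. NULL cursor == print the plain IP. */
#include <stdio.h>
#include <stddef.h>

struct toy_cursor {
	int depth;
	unsigned long ips[8];
};

static int toy_fprintf_sym(unsigned long ip, const struct toy_cursor *cursor, FILE *fp)
{
	int printed = 0;

	if (cursor != NULL) {		/* resolved by the caller */
		for (int i = 0; i < cursor->depth; i++)
			printed += fprintf(fp, "\t%#lx\n", cursor->ips[i]);
	} else {
		printed += fprintf(fp, "%#lx\n", ip);
	}
	return printed;
}

int main(void)
{
	struct toy_cursor cursor = { .depth = 2, .ips = { 0x1000, 0x2000 } };

	toy_fprintf_sym(0x1000, &cursor, stdout);	/* with a callchain */
	toy_fprintf_sym(0x3000, NULL, stdout);		/* without one */
	return 0;
}

Keeping the printer free of resolver calls is what lets the fprintf
code be linked without dragging in the symbol-resolution machinery.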
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-2ti2jbra8fypdfawwwm3aee3@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 36 +++++++++++++++++++--------------- tools/perf/builtin-trace.c | 8 +++++--- tools/perf/util/evsel.c | 39 +++++++++++++------------------------ tools/perf/util/evsel.h | 19 +++++++++--------- 4 files changed, 48 insertions(+), 54 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 717ba02152340..875d84e7ba5ba 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -569,19 +569,23 @@ static void print_sample_bts(struct perf_sample *sample, /* print branch_from information */ if (PRINT_FIELD(IP)) { unsigned int print_opts = output[attr->type].print_ip_opts; + struct callchain_cursor *cursor = NULL, cursor_callchain; - if (symbol_conf.use_callchain && sample->callchain) { - printf("\n"); - } else { - printf(" "); + if (symbol_conf.use_callchain && sample->callchain && + thread__resolve_callchain(al->thread, &cursor_callchain, evsel, + sample, NULL, NULL, scripting_max_stack) == 0) + cursor = &cursor_callchain; + + if (cursor == NULL) { + putchar(' '); if (print_opts & EVSEL__PRINT_SRCLINE) { print_srcline_last = true; print_opts &= ~EVSEL__PRINT_SRCLINE; } - } - perf_evsel__fprintf_sym(evsel, sample, al, 0, print_opts, - symbol_conf.use_callchain, - scripting_max_stack, stdout); + } else + putchar('\n'); + + sample__fprintf_sym(sample, al, 0, print_opts, cursor, stdout); } /* print branch_to information */ @@ -784,15 +788,15 @@ static void process_event(struct perf_script *script, printf("%16" PRIu64, sample->weight); if (PRINT_FIELD(IP)) { - if (!symbol_conf.use_callchain) - printf(" "); - else - printf("\n"); + struct callchain_cursor *cursor = NULL, cursor_callchain; + + if (symbol_conf.use_callchain && + thread__resolve_callchain(al->thread, &cursor_callchain, evsel, + sample, NULL, NULL, scripting_max_stack) == 0) + cursor = &cursor_callchain; - perf_evsel__fprintf_sym(evsel, sample, al, 0, - output[attr->type].print_ip_opts, - symbol_conf.use_callchain, - scripting_max_stack, stdout); + putchar(cursor ? 
'\n' : ' '); + sample__fprintf_sym(sample, al, 0, output[attr->type].print_ip_opts, cursor, stdout); } if (PRINT_FIELD(IREGS)) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index e5f0cc16bb933..0e2a82bda22f8 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1890,14 +1890,16 @@ static int trace__fprintf_callchain(struct trace *trace, struct perf_evsel *evse if (sample->callchain == NULL) return 0; - if (machine__resolve(trace->host, &al, sample) < 0) { + if (machine__resolve(trace->host, &al, sample) < 0 || + thread__resolve_callchain(al.thread, &callchain_cursor, evsel, + sample, NULL, NULL, scripting_max_stack)) { pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); return 0; } - return perf_evsel__fprintf_callchain(evsel, sample, &al, 38, print_opts, - scripting_max_stack, trace->output); + return sample__fprintf_callchain(sample, &al, 38, print_opts, + &callchain_cursor, trace->output); } static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 60bba67e6959e..35c5a52822396 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2343,13 +2343,12 @@ int perf_evsel__fprintf(struct perf_evsel *evsel, return ++printed; } -int perf_evsel__fprintf_callchain(struct perf_evsel *evsel, struct perf_sample *sample, - struct addr_location *al, int left_alignment, - unsigned int print_opts, unsigned int stack_depth, - FILE *fp) +int sample__fprintf_callchain(struct perf_sample *sample, + struct addr_location *al, int left_alignment, + unsigned int print_opts, struct callchain_cursor *cursor, + FILE *fp) { int printed = 0; - struct callchain_cursor cursor; struct callchain_cursor_node *node; int print_ip = print_opts & EVSEL__PRINT_IP; int print_sym = print_opts & EVSEL__PRINT_SYM; @@ -2363,22 +2362,15 @@ int perf_evsel__fprintf_callchain(struct perf_evsel *evsel, struct perf_sample * if (sample->callchain) { struct addr_location node_al; - if (thread__resolve_callchain(al->thread, &cursor, evsel, - sample, NULL, NULL, - stack_depth) != 0) { - if (verbose) - error("Failed to resolve callchain. 
Skipping\n"); - return printed; - } - callchain_cursor_commit(&cursor); + callchain_cursor_commit(cursor); if (print_symoffset) node_al = *al; - while (stack_depth) { + while (1) { u64 addr = 0; - node = callchain_cursor_current(&cursor); + node = callchain_cursor_current(cursor); if (!node) break; @@ -2418,20 +2410,17 @@ int perf_evsel__fprintf_callchain(struct perf_evsel *evsel, struct perf_sample * if (!print_oneline) printed += fprintf(fp, "\n"); - - stack_depth--; next: - callchain_cursor_advance(&cursor); + callchain_cursor_advance(cursor); } } return printed; } -int perf_evsel__fprintf_sym(struct perf_evsel *evsel, struct perf_sample *sample, - struct addr_location *al, int left_alignment, - unsigned int print_opts, bool print_callchain, - unsigned int stack_depth, FILE *fp) +int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al, + int left_alignment, unsigned int print_opts, + struct callchain_cursor *cursor, FILE *fp) { int printed = 0; int print_ip = print_opts & EVSEL__PRINT_IP; @@ -2441,9 +2430,9 @@ int perf_evsel__fprintf_sym(struct perf_evsel *evsel, struct perf_sample *sample int print_srcline = print_opts & EVSEL__PRINT_SRCLINE; int print_unknown_as_addr = print_opts & EVSEL__PRINT_UNKNOWN_AS_ADDR; - if (print_callchain && sample->callchain) { - printed += perf_evsel__fprintf_callchain(evsel, sample, al, left_alignment, - print_opts, stack_depth, fp); + if (cursor != NULL) { + printed += sample__fprintf_callchain(sample, al, left_alignment, + print_opts, cursor, fp); } else if (!(al->sym && al->sym->ignore)) { printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 013f3615730bd..abadfea1dbaa9 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -395,16 +395,15 @@ int perf_evsel__fprintf(struct perf_evsel *evsel, #define EVSEL__PRINT_SRCLINE (1<<5) #define EVSEL__PRINT_UNKNOWN_AS_ADDR (1<<6) -int perf_evsel__fprintf_callchain(struct perf_evsel *evsel, - struct perf_sample *sample, - struct addr_location *al, int left_alignment, - unsigned int print_opts, - unsigned int stack_depth, FILE *fp); - -int perf_evsel__fprintf_sym(struct perf_evsel *evsel, struct perf_sample *sample, - struct addr_location *al, int left_alignment, - unsigned int print_opts, bool print_callchain, - unsigned int stack_depth, FILE *fp); +struct callchain_cursor; + +int sample__fprintf_callchain(struct perf_sample *sample, struct addr_location *al, + int left_alignment, unsigned int print_opts, + struct callchain_cursor *cursor, FILE *fp); + +int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al, + int left_alignment, unsigned int print_opts, + struct callchain_cursor *cursor, FILE *fp); bool perf_evsel__fallback(struct perf_evsel *evsel, int err, char *msg, size_t msgsize); -- GitLab From d327e60cfae2201bcdee5aeb9b5a42e3988b184f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 14 Apr 2016 17:53:49 -0300 Subject: [PATCH 258/705] perf tools: Remove addr_location argument to sample__fprintf_callchain Not used at all, nuke it. 
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-jf2w8ce8nl3wso3vuodg5jci@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 3 +-- tools/perf/util/evsel.c | 8 ++------ tools/perf/util/evsel.h | 4 ++-- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 0e2a82bda22f8..5e5a95e34a531 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1898,8 +1898,7 @@ static int trace__fprintf_callchain(struct trace *trace, struct perf_evsel *evse return 0; } - return sample__fprintf_callchain(sample, &al, 38, print_opts, - &callchain_cursor, trace->output); + return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output); } static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 35c5a52822396..060f619dea883 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2343,8 +2343,7 @@ int perf_evsel__fprintf(struct perf_evsel *evsel, return ++printed; } -int sample__fprintf_callchain(struct perf_sample *sample, - struct addr_location *al, int left_alignment, +int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, unsigned int print_opts, struct callchain_cursor *cursor, FILE *fp) { @@ -2364,9 +2363,6 @@ int sample__fprintf_callchain(struct perf_sample *sample, callchain_cursor_commit(cursor); - if (print_symoffset) - node_al = *al; - while (1) { u64 addr = 0; @@ -2431,7 +2427,7 @@ int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al, int print_unknown_as_addr = print_opts & EVSEL__PRINT_UNKNOWN_AS_ADDR; if (cursor != NULL) { - printed += sample__fprintf_callchain(sample, al, left_alignment, + printed += sample__fprintf_callchain(sample, left_alignment, print_opts, cursor, fp); } else if (!(al->sym && al->sym->ignore)) { printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index abadfea1dbaa9..b993218744d46 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -397,8 +397,8 @@ int perf_evsel__fprintf(struct perf_evsel *evsel, struct callchain_cursor; -int sample__fprintf_callchain(struct perf_sample *sample, struct addr_location *al, - int left_alignment, unsigned int print_opts, +int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, + unsigned int print_opts, struct callchain_cursor *cursor, FILE *fp); int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al, -- GitLab From 6125cc8dac432948a31df4d4ac20dd2d4f8c6c27 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 14 Apr 2016 18:15:18 -0300 Subject: [PATCH 259/705] perf script: Add --max-stack knob Works just like with 'perf report'. In some cases we may want to have more than 127 entries, the default maximum. 
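In toy form, what the cap amounts to while parsing the callchain (a
hedged sketch with hypothetical names; PERF_MAX_STACK_DEPTH is the real
default of 127):

/* Toy sketch of a --max-stack style cap: stop consuming callchain
 * entries once the configured depth is reached. */
#include <stdio.h>

#define TOY_MAX_STACK 127	/* stand-in for PERF_MAX_STACK_DEPTH */

static unsigned int toy_walk_chain(const unsigned long *ips, unsigned int nr,
				   unsigned int max_stack)
{
	unsigned int i;

	for (i = 0; i < nr && i < max_stack; i++)
		printf("\t%#lx\n", ips[i]);
	return i;		/* number of entries actually kept */
}

int main(void)
{
	unsigned long chain[] = { 0x400100, 0x400200, 0x400300, 0x400400 };

	toy_walk_chain(chain, 4, 2);		/* e.g. --max-stack 2 */
	toy_walk_chain(chain, 4, TOY_MAX_STACK);
	return 0;
}

The trade-off named in the documentation below falls out directly:
a smaller cap loses the outer frames but processes faster.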
Cc: Adrian Hunter
Cc: David Ahern
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Wang Nan
Link: http://lkml.kernel.org/n/tip-mqkz2p5ok2978gztb0vsnocc@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/Documentation/perf-script.txt | 10 ++++++++++
 tools/perf/builtin-script.c              |  5 +++++
 2 files changed, 15 insertions(+)

diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 22ef3933342ad..4fc44c75263fd 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -259,6 +259,16 @@ include::itrace.txt[]
 --full-source-path::
 	Show the full path for source files for srcline output.
 
+--max-stack::
+	Set the stack depth limit when parsing the callchain, anything
+	beyond the specified depth will be ignored. This is a trade-off
+	between information loss and faster processing especially for
+	workloads that can have a very long callchain stack.
+	Note that when using the --itrace option the synthesized callchain size
+	will override this value if the synthesized callchain size is bigger.
+
+	Default: 127
+
 --ns::
 	Use 9 decimal places when displaying time (i.e. show the nanoseconds)
 
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 875d84e7ba5ba..0e93282b405ed 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -22,6 +22,7 @@
 #include "util/thread_map.h"
 #include "util/stat.h"
 #include <linux/bitmap.h>
+#include <linux/stringify.h>
 #include "asm/bug.h"
 #include "util/mem-events.h"
 
@@ -2027,6 +2028,10 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 		   "only consider symbols in these pids"),
 	OPT_STRING(0, "tid", &symbol_conf.tid_list_str, "tid[,tid...]",
 		   "only consider symbols in these tids"),
+	OPT_UINTEGER(0, "max-stack", &scripting_max_stack,
+		     "Set the maximum stack depth when parsing the callchain, "
+		     "anything beyond the specified depth will be ignored. "
+		     "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
 	OPT_BOOLEAN('I', "show-info", &show_full_info,
 		    "display extended information from perf.data file"),
 	OPT_BOOLEAN('\0', "show-kernel-path", &symbol_conf.show_kernel_path,
--
GitLab


From c6d4a494a207a336b45e52a44550150964daf1ce Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Thu, 14 Apr 2016 18:29:08 -0300
Subject: [PATCH 260/705] perf trace: Add --max-stack knob

Similar to the one in the other tools (report, script, top).

Cc: Adrian Hunter
Cc: David Ahern
Cc: Jiri Olsa
Cc: Milian Wolff
Cc: Namhyung Kim
Cc: Wang Nan
Link: http://lkml.kernel.org/n/tip-lh7kk5a5t3erwxw31ah0cgar@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/Documentation/perf-trace.txt | 9 +++++++++
 tools/perf/builtin-trace.c              | 9 ++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index 1bbcf305d2331..2ee0c4fee18d3 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -129,6 +129,15 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
 --event::
 	Trace other events, see 'perf list' for a complete list.
 
+--max-stack::
+	Set the stack depth limit when parsing the callchain, anything
+	beyond the specified depth will be ignored. Note that at this point
+	this is just about the presentation part, i.e. the kernel is still
+	not limiting, the overhead of callchains needs to be set via the
+	knobs in --call-graph dwarf.
+
+	Default: 127
+
 --proc-map-timeout::
 	When processing pre-existing threads /proc/XXX/mmap, it may take a long time,
 	because the file may be huge. A time out is needed in such cases.
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 5e5a95e34a531..39a158923acf8 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -46,6 +46,7 @@
 #include
 #include
 #include
+#include <linux/stringify.h>
 
 #ifndef O_CLOEXEC
 # define O_CLOEXEC	02000000
@@ -106,6 +107,7 @@ struct trace {
 		u64		vfs_getname,
 				proc_getname;
 	} stats;
+	unsigned int		max_stack;
 	bool			not_ev_qualifier;
 	bool			live;
 	bool			full_time;
@@ -1892,7 +1894,7 @@ static int trace__fprintf_callchain(struct trace *trace, struct perf_evsel *evse
 
 	if (machine__resolve(trace->host, &al, sample) < 0 ||
 	    thread__resolve_callchain(al.thread, &callchain_cursor, evsel,
-				      sample, NULL, NULL, scripting_max_stack)) {
+				      sample, NULL, NULL, trace->max_stack)) {
 		pr_err("Problem processing %s callchain, skipping...\n",
 		       perf_evsel__name(evsel));
 		return 0;
@@ -3029,6 +3031,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 		.show_comm = true,
 		.trace_syscalls = true,
 		.kernel_syscallchains = false,
+		.max_stack = PERF_MAX_STACK_DEPTH,
 	};
 	const char *output_name = NULL;
 	const char *ev_qualifier_str = NULL;
@@ -3079,6 +3082,10 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 		     &record_parse_callchain_opt),
 	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
 		    "Show the kernel callchains on the syscall exit path"),
+	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
+		     "Set the maximum stack depth when parsing the callchain, "
+		     "anything beyond the specified depth will be ignored. "
+		     "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
 		     "per thread proc mmap processing timeout in ms"),
 	OPT_END()
--
GitLab


From 25da4fab5f66e659da768cd61dbf8c3861104d7c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Thu, 14 Apr 2016 19:45:01 -0300
Subject: [PATCH 261/705] perf evsel: Move fprintf methods to separate source file

They still use functions that would drag more stuff to the python
binding, where these fprintf methods are not used, so separate them.

Cc: Adrian Hunter
Cc: David Ahern
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Wang Nan
Link: http://lkml.kernel.org/n/tip-xfp0mgq3hh3px61di6ixi1jk@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/util/Build           |   1 +
 tools/perf/util/evsel.c         | 206 -------------------------------
 tools/perf/util/evsel_fprintf.c | 212 ++++++++++++++++++++++++++++++++
 3 files changed, 213 insertions(+), 206 deletions(-)
 create mode 100644 tools/perf/util/evsel_fprintf.c

diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 61021334e9581..85a9ab62e23fe 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -8,6 +8,7 @@ libperf-y += env.o
 libperf-y += event.o
 libperf-y += evlist.o
 libperf-y += evsel.o
+libperf-y += evsel_fprintf.o
 libperf-y += find_bit.o
 libperf-y += kallsyms.o
 libperf-y += levenshtein.o
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 060f619dea883..545bb3f0b2b06 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -2254,212 +2254,6 @@ u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample,
 	return 0;
 }
 
-static int comma_fprintf(FILE *fp, bool *first, const char *fmt, ...)
-{ - va_list args; - int ret = 0; - - if (!*first) { - ret += fprintf(fp, ","); - } else { - ret += fprintf(fp, ":"); - *first = false; - } - - va_start(args, fmt); - ret += vfprintf(fp, fmt, args); - va_end(args); - return ret; -} - -static int __print_attr__fprintf(FILE *fp, const char *name, const char *val, void *priv) -{ - return comma_fprintf(fp, (bool *)priv, " %s: %s", name, val); -} - -int perf_evsel__fprintf(struct perf_evsel *evsel, - struct perf_attr_details *details, FILE *fp) -{ - bool first = true; - int printed = 0; - - if (details->event_group) { - struct perf_evsel *pos; - - if (!perf_evsel__is_group_leader(evsel)) - return 0; - - if (evsel->nr_members > 1) - printed += fprintf(fp, "%s{", evsel->group_name ?: ""); - - printed += fprintf(fp, "%s", perf_evsel__name(evsel)); - for_each_group_member(pos, evsel) - printed += fprintf(fp, ",%s", perf_evsel__name(pos)); - - if (evsel->nr_members > 1) - printed += fprintf(fp, "}"); - goto out; - } - - printed += fprintf(fp, "%s", perf_evsel__name(evsel)); - - if (details->verbose) { - printed += perf_event_attr__fprintf(fp, &evsel->attr, - __print_attr__fprintf, &first); - } else if (details->freq) { - const char *term = "sample_freq"; - - if (!evsel->attr.freq) - term = "sample_period"; - - printed += comma_fprintf(fp, &first, " %s=%" PRIu64, - term, (u64)evsel->attr.sample_freq); - } - - if (details->trace_fields) { - struct format_field *field; - - if (evsel->attr.type != PERF_TYPE_TRACEPOINT) { - printed += comma_fprintf(fp, &first, " (not a tracepoint)"); - goto out; - } - - field = evsel->tp_format->format.fields; - if (field == NULL) { - printed += comma_fprintf(fp, &first, " (no trace field)"); - goto out; - } - - printed += comma_fprintf(fp, &first, " trace_fields: %s", field->name); - - field = field->next; - while (field) { - printed += comma_fprintf(fp, &first, "%s", field->name); - field = field->next; - } - } -out: - fputc('\n', fp); - return ++printed; -} - -int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, - unsigned int print_opts, struct callchain_cursor *cursor, - FILE *fp) -{ - int printed = 0; - struct callchain_cursor_node *node; - int print_ip = print_opts & EVSEL__PRINT_IP; - int print_sym = print_opts & EVSEL__PRINT_SYM; - int print_dso = print_opts & EVSEL__PRINT_DSO; - int print_symoffset = print_opts & EVSEL__PRINT_SYMOFFSET; - int print_oneline = print_opts & EVSEL__PRINT_ONELINE; - int print_srcline = print_opts & EVSEL__PRINT_SRCLINE; - int print_unknown_as_addr = print_opts & EVSEL__PRINT_UNKNOWN_AS_ADDR; - char s = print_oneline ? 
' ' : '\t';
-
-	if (sample->callchain) {
-		struct addr_location node_al;
-
-		callchain_cursor_commit(cursor);
-
-		while (1) {
-			u64 addr = 0;
-
-			node = callchain_cursor_current(cursor);
-			if (!node)
-				break;
-
-			if (node->sym && node->sym->ignore)
-				goto next;
-
-			printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " ");
-
-			if (print_ip)
-				printed += fprintf(fp, "%c%16" PRIx64, s, node->ip);
-
-			if (node->map)
-				addr = node->map->map_ip(node->map, node->ip);
-
-			if (print_sym) {
-				printed += fprintf(fp, " ");
-				node_al.addr = addr;
-				node_al.map  = node->map;
-
-				if (print_symoffset) {
-					printed += __symbol__fprintf_symname_offs(node->sym, &node_al,
-										  print_unknown_as_addr, fp);
-				} else {
-					printed += __symbol__fprintf_symname(node->sym, &node_al,
-									     print_unknown_as_addr, fp);
-				}
-			}
-
-			if (print_dso) {
-				printed += fprintf(fp, " (");
-				printed += map__fprintf_dsoname(node->map, fp);
-				printed += fprintf(fp, ")");
-			}
-
-			if (print_srcline)
-				printed += map__fprintf_srcline(node->map, addr, "\n  ", fp);
-
-			if (!print_oneline)
-				printed += fprintf(fp, "\n");
-next:
-			callchain_cursor_advance(cursor);
-		}
-	}
-
-	return printed;
-}
-
-int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al,
-			int left_alignment, unsigned int print_opts,
-			struct callchain_cursor *cursor, FILE *fp)
-{
-	int printed = 0;
-	int print_ip = print_opts & EVSEL__PRINT_IP;
-	int print_sym = print_opts & EVSEL__PRINT_SYM;
-	int print_dso = print_opts & EVSEL__PRINT_DSO;
-	int print_symoffset = print_opts & EVSEL__PRINT_SYMOFFSET;
-	int print_srcline = print_opts & EVSEL__PRINT_SRCLINE;
-	int print_unknown_as_addr = print_opts & EVSEL__PRINT_UNKNOWN_AS_ADDR;
-
-	if (cursor != NULL) {
-		printed += sample__fprintf_callchain(sample, left_alignment,
-						     print_opts, cursor, fp);
-	} else if (!(al->sym && al->sym->ignore)) {
-		printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " ");
-
-		if (print_ip)
-			printed += fprintf(fp, "%16" PRIx64, sample->ip);
-
-		if (print_sym) {
-			printed += fprintf(fp, " ");
-			if (print_symoffset) {
-				printed += __symbol__fprintf_symname_offs(al->sym, al,
-									  print_unknown_as_addr, fp);
-			} else {
-				printed += __symbol__fprintf_symname(al->sym, al,
-								     print_unknown_as_addr, fp);
-			}
-		}
-
-		if (print_dso) {
-			printed += fprintf(fp, " (");
-			printed += map__fprintf_dsoname(al->map, fp);
-			printed += fprintf(fp, ")");
-		}
-
-		if (print_srcline)
-			printed += map__fprintf_srcline(al->map, al->addr, "\n  ", fp);
-	}
-
-	return printed;
-}
-
 bool perf_evsel__fallback(struct perf_evsel *evsel, int err,
 			  char *msg, size_t msgsize)
 {
diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c
new file mode 100644
index 0000000000000..3674e77ad6404
--- /dev/null
+++ b/tools/perf/util/evsel_fprintf.c
@@ -0,0 +1,212 @@
+#include <inttypes.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include "evsel.h"
+#include "callchain.h"
+#include "map.h"
+#include "symbol.h"
+
+static int comma_fprintf(FILE *fp, bool *first, const char *fmt, ...)
+{ + va_list args; + int ret = 0; + + if (!*first) { + ret += fprintf(fp, ","); + } else { + ret += fprintf(fp, ":"); + *first = false; + } + + va_start(args, fmt); + ret += vfprintf(fp, fmt, args); + va_end(args); + return ret; +} + +static int __print_attr__fprintf(FILE *fp, const char *name, const char *val, void *priv) +{ + return comma_fprintf(fp, (bool *)priv, " %s: %s", name, val); +} + +int perf_evsel__fprintf(struct perf_evsel *evsel, + struct perf_attr_details *details, FILE *fp) +{ + bool first = true; + int printed = 0; + + if (details->event_group) { + struct perf_evsel *pos; + + if (!perf_evsel__is_group_leader(evsel)) + return 0; + + if (evsel->nr_members > 1) + printed += fprintf(fp, "%s{", evsel->group_name ?: ""); + + printed += fprintf(fp, "%s", perf_evsel__name(evsel)); + for_each_group_member(pos, evsel) + printed += fprintf(fp, ",%s", perf_evsel__name(pos)); + + if (evsel->nr_members > 1) + printed += fprintf(fp, "}"); + goto out; + } + + printed += fprintf(fp, "%s", perf_evsel__name(evsel)); + + if (details->verbose) { + printed += perf_event_attr__fprintf(fp, &evsel->attr, + __print_attr__fprintf, &first); + } else if (details->freq) { + const char *term = "sample_freq"; + + if (!evsel->attr.freq) + term = "sample_period"; + + printed += comma_fprintf(fp, &first, " %s=%" PRIu64, + term, (u64)evsel->attr.sample_freq); + } + + if (details->trace_fields) { + struct format_field *field; + + if (evsel->attr.type != PERF_TYPE_TRACEPOINT) { + printed += comma_fprintf(fp, &first, " (not a tracepoint)"); + goto out; + } + + field = evsel->tp_format->format.fields; + if (field == NULL) { + printed += comma_fprintf(fp, &first, " (no trace field)"); + goto out; + } + + printed += comma_fprintf(fp, &first, " trace_fields: %s", field->name); + + field = field->next; + while (field) { + printed += comma_fprintf(fp, &first, "%s", field->name); + field = field->next; + } + } +out: + fputc('\n', fp); + return ++printed; +} + +int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, + unsigned int print_opts, struct callchain_cursor *cursor, + FILE *fp) +{ + int printed = 0; + struct callchain_cursor_node *node; + int print_ip = print_opts & EVSEL__PRINT_IP; + int print_sym = print_opts & EVSEL__PRINT_SYM; + int print_dso = print_opts & EVSEL__PRINT_DSO; + int print_symoffset = print_opts & EVSEL__PRINT_SYMOFFSET; + int print_oneline = print_opts & EVSEL__PRINT_ONELINE; + int print_srcline = print_opts & EVSEL__PRINT_SRCLINE; + int print_unknown_as_addr = print_opts & EVSEL__PRINT_UNKNOWN_AS_ADDR; + char s = print_oneline ? 
' ' : '\t'; + + if (sample->callchain) { + struct addr_location node_al; + + callchain_cursor_commit(cursor); + + while (1) { + u64 addr = 0; + + node = callchain_cursor_current(cursor); + if (!node) + break; + + if (node->sym && node->sym->ignore) + goto next; + + printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); + + if (print_ip) + printed += fprintf(fp, "%c%16" PRIx64, s, node->ip); + + if (node->map) + addr = node->map->map_ip(node->map, node->ip); + + if (print_sym) { + printed += fprintf(fp, " "); + node_al.addr = addr; + node_al.map = node->map; + + if (print_symoffset) { + printed += __symbol__fprintf_symname_offs(node->sym, &node_al, + print_unknown_as_addr, fp); + } else { + printed += __symbol__fprintf_symname(node->sym, &node_al, + print_unknown_as_addr, fp); + } + } + + if (print_dso) { + printed += fprintf(fp, " ("); + printed += map__fprintf_dsoname(node->map, fp); + printed += fprintf(fp, ")"); + } + + if (print_srcline) + printed += map__fprintf_srcline(node->map, addr, "\n ", fp); + + if (!print_oneline) + printed += fprintf(fp, "\n"); +next: + callchain_cursor_advance(cursor); + } + } + + return printed; +} + +int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al, + int left_alignment, unsigned int print_opts, + struct callchain_cursor *cursor, FILE *fp) +{ + int printed = 0; + int print_ip = print_opts & EVSEL__PRINT_IP; + int print_sym = print_opts & EVSEL__PRINT_SYM; + int print_dso = print_opts & EVSEL__PRINT_DSO; + int print_symoffset = print_opts & EVSEL__PRINT_SYMOFFSET; + int print_srcline = print_opts & EVSEL__PRINT_SRCLINE; + int print_unknown_as_addr = print_opts & EVSEL__PRINT_UNKNOWN_AS_ADDR; + + if (cursor != NULL) { + printed += sample__fprintf_callchain(sample, left_alignment, + print_opts, cursor, fp); + } else if (!(al->sym && al->sym->ignore)) { + printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); + + if (print_ip) + printed += fprintf(fp, "%16" PRIx64, sample->ip); + + if (print_sym) { + printed += fprintf(fp, " "); + if (print_symoffset) { + printed += __symbol__fprintf_symname_offs(al->sym, al, + print_unknown_as_addr, fp); + } else { + printed += __symbol__fprintf_symname(al->sym, al, + print_unknown_as_addr, fp); + } + } + + if (print_dso) { + printed += fprintf(fp, " ("); + printed += map__fprintf_dsoname(al->map, fp); + printed += fprintf(fp, ")"); + } + + if (print_srcline) + printed += map__fprintf_srcline(al->map, al->addr, "\n ", fp); + } + + return printed; +} -- GitLab From dec8e8f6e6504aa3496c0f7cc10c756bb0e10f44 Mon Sep 17 00:00:00 2001 From: Jack Pham Date: Thu, 14 Apr 2016 23:37:26 -0700 Subject: [PATCH 262/705] regmap: spmi: Fix regmap_spmi_ext_read in multi-byte case Specifically for the case of reads that use the Extended Register Read Long command, a multi-byte read operation is broken up into 8-byte chunks. However the call to spmi_ext_register_readl() is incorrectly passing 'val_size', which if greater than 8 will always fail. The argument should instead be 'len'. 
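The bug class here, reduced to a runnable toy (a hedged sketch with a
toy reader, not the SPMI API): a chunked-read loop must hand the
low-level read the per-iteration chunk length, not the total remaining
size.

/* Toy illustration: the backend moves at most CHUNK bytes per call,
 * so the loop must pass len, not val_size, exactly as in the fix. */
#include <stdio.h>
#include <string.h>

#define CHUNK 8

static int toy_readl(const unsigned char *dev, unsigned char *buf, size_t len)
{
	if (len > CHUNK)
		return -1;	/* how val_size > 8 always failed */
	memcpy(buf, dev, len);
	return 0;
}

static int toy_ext_read(const unsigned char *dev, unsigned char *val, size_t val_size)
{
	while (val_size) {
		size_t len = val_size < CHUNK ? val_size : CHUNK; /* min_t() */
		int err = toy_readl(dev, val, len);	/* pass len, not val_size */

		if (err)
			return err;
		dev += len;
		val += len;
		val_size -= len;
	}
	return 0;
}

int main(void)
{
	unsigned char dev[20] = "0123456789abcdefghi", out[20] = { 0 };

	printf("%d %s\n", toy_ext_read(dev, out, sizeof(dev)), out);
	return 0;
}

With the total size passed instead, any read larger than one chunk
fails on the very first iteration, which is why the breakage only
showed up in the multi-byte case.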
Fixes: c9afbb05a9ff ("regmap: spmi: support base and extended register spaces") Signed-off-by: Jack Pham Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- drivers/base/regmap/regmap-spmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/regmap/regmap-spmi.c b/drivers/base/regmap/regmap-spmi.c index 7e58f65603990..4a36e415e9385 100644 --- a/drivers/base/regmap/regmap-spmi.c +++ b/drivers/base/regmap/regmap-spmi.c @@ -142,7 +142,7 @@ static int regmap_spmi_ext_read(void *context, while (val_size) { len = min_t(size_t, val_size, 8); - err = spmi_ext_register_readl(context, addr, val, val_size); + err = spmi_ext_register_readl(context, addr, val, len); if (err) goto err_out; -- GitLab From e519bd9a07fe5b13c47b506d0fbadb7498e60607 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 15 Apr 2016 10:20:10 -0300 Subject: [PATCH 263/705] perf trace: Do not print interrupted syscalls when using --duration With multiple threads, e.g. a system wide trace session, and one syscall is midway in a thread and another thread starts another syscall we must print the start of the interrupted syscall followed by ..., but that can't be done that way when we use the --duration filter, as we have to wait for the syscall exit to calculate the duration and decide if it should be filtered, so we have to disable the interrupted logic and only print at syscall exit, duh. Before: # trace --duration 100 9.248 (0.023 ms): gnome-shell/2287 poll(ufds: 0x7ffc5ea26580, nfds: 1, timeout_msecs: 4294967295) ... 9.296 (0.001 ms): gnome-shell/2287 recvmsg(fd: 11, msg: 0x7ffc5ea264a0 ) ... 9.311 (0.008 ms): Xorg/2025 select(n: 512, inp: 0x83a8e0, tvp: 0x8316a0 ) ... 9.859 (0.023 ms): gnome-shell/2287 poll(ufds: 0x7ffc5ea24250, nfds: 1, timeout_msecs: 4294967295) ... 9.942 (0.051 ms): Xorg/2025 select(n: 512, inp: 0x83a8e0, tvp: 0x8316a0 ) ... 10.467 (0.003 ms): gnome-shell/2287 poll(ufds: 0x55e623431220, nfds: 50, timeout_msecs: 4294967295) ... 11.136 (0.382 ms): Xorg/2025 select(n: 512, inp: 0x83a8e0, tvp: 0x8316a0 ) ... 11.223 (0.023 ms): SoftwareVsyncT/24369 futex(uaddr: 0x7f5ec5df8c14, op: WAIT_BITSET|PRIV, val: 1, utime: 0x7f5ec5df8b68, val3: 4294967295) ... 16.865 (5.501 ms): firefox/24321 poll(ufds: 0x7f5ec388b460, nfds: 6, timeout_msecs: 4294967295 ) ... 22.571 (0.006 ms): Xorg/2025 select(n: 512, inp: 0x83a8e0, tvp: 0x8316a0 ) ... 26.793 (4.063 ms): gnome-shell/2287 poll(ufds: 0x55e623431220, nfds: 50, timeout_msecs: 4294967295) ... 26.917 (0.080 ms): Xorg/2025 select(n: 512, inp: 0x83a8e0, tvp: 0x8316a0 ) ... 27.291 (0.355 ms): qemu-system-x8/10065 ppoll(ufds: 0x55c98b39e400, nfds: 72, tsp: 0x7fffe4e4fe60, sigsetsize: 8) ... 27.336 (0.012 ms): SoftwareVsyncT/24369 futex(uaddr: 0x7f5ec5df8c14, op: WAIT_BITSET|PRIV, val: 1, utime: 0x7f5ec5df8b68, val3: 4294967295) ... 33.370 (5.958 ms): firefox/24321 poll(ufds: 0x7f5ec388b460, nfds: 6, timeout_msecs: 4294967295) ... 33.866 (0.021 ms): Xorg/2025 select(n: 512, inp: 0x83a8e0, tvp: 0x8316a0 ) ... 35.762 (1.611 ms): gnome-shell/2287 poll(ufds: 0x55e623431220, nfds: 50, timeout_msecs: 8 ) ... 38.765 (2.910 ms): Xorg/2025 select(n: 512, inp: 0x83a8e0, tvp: 0x8316a0 ) ... 
After: # trace --duration 100 238.292 (153.226 ms): hexchat/2786 poll(ufds: 0x559ea372f370, nfds: 6, timeout_msecs: 153) = 0 Timeout 249.634 (199.433 ms): Xorg/2025 select(n: 512, inp: 0x83a8e0, tvp: 0x7ffdcbb63610 ) = 1 385.583 (147.257 ms): hexchat/2786 poll(ufds: 0x559ea372f370, nfds: 6, timeout_msecs: 147) = 0 Timeout 397.166 (110.779 ms): gnome-shell/2287 poll(ufds: 0x55e623431220, nfds: 50, timeout_msecs: 4294967295) = 1 601.839 (132.066 ms): Xorg/2025 select(n: 512, inp: 0x83a8e0, tvp: 0x8316a0 ) = 1 602.445 (132.679 ms): gnome-shell/2287 poll(ufds: 0x55e623431220, nfds: 50, timeout_msecs: 4294967295) = 1 686.122 (300.418 ms): hexchat/2786 poll(ufds: 0x559ea372f370, nfds: 6, timeout_msecs: 300) = 0 Timeout 815.033 (184.641 ms): JS Helper/24352 futex(uaddr: 0x7f5ed98e584c, op: WAIT|PRIV, val: 1149859) = 0 825.868 (195.469 ms): JS Helper/24351 futex(uaddr: 0x7f5ed98e584c, op: WAIT|PRIV, val: 1149860) = 0 840.738 (210.335 ms): JS Helper/24350 futex(uaddr: 0x7f5ed98e584c, op: WAIT|PRIV, val: 1149861) = 0 914.898 (158.692 ms): Compositor/24363 futex(uaddr: 0x7f5ec8dfebf4, op: WAIT|PRIV, val: 1) = 0 915.199 (100.747 ms): Timer/24358 futex(uaddr: 0x7f5ed98e56cc, op: WAIT_BITSET|PRIV|CLKRT, val: 2545397, utime: 0x7f5ecdbfec30, val3: 4294967295) = 0 986.639 (247.325 ms): hexchat/2786 poll(ufds: 0x559ea372f370, nfds: 6, timeout_msecs: 247) = 0 Timeout 996.239 (500.591 ms): chrome/16237 poll(ufds: 0x3ecd739bd0, nfds: 5, timeout_msecs: 500) = 0 Timeout 1042.890 (120.076 ms): Timer/24358 futex(uaddr: 0x7f5ed98e56cc, op: WAIT_BITSET|PRIV|CLKRT, val: 2545403, utime: 0x7f5ecdbfec30, val3: 4294967295) = -1 ETIMEDOUT Connection timed out Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-d2nay6kjax5ro991c9kelvi5@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 39a158923acf8..65f6abe75d714 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1849,7 +1849,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, goto out_put; } - if (!trace->summary_only) + if (!(trace->duration_filter || trace->summary_only)) trace__printf_interrupted_entry(trace, sample); ttrace->entry_time = sample->time; -- GitLab From 5cf9c84e21067ec7a44648aedbc38c197d707258 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 15 Apr 2016 11:10:31 -0300 Subject: [PATCH 264/705] perf trace: Introduce --min-stack filter Counterpart to --max-stack, to help focusing on deeply nested calls. Can be combined with --duration, etc. 
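The gist, as a hedged toy sketch (hypothetical names, not the perf
implementation): after the callchain is resolved, samples whose chains
are shallower than the threshold are simply skipped.

/* Toy sketch of a --min-stack style filter on resolved chain depth. */
#include <stdio.h>

struct toy_sample {
	const char *name;
	unsigned int chain_depth;
};

static void toy_filter(const struct toy_sample *s, unsigned int min_stack)
{
	if (s->chain_depth < min_stack)
		return;		/* not nested deeply enough, drop it */
	printf("%s (depth %u)\n", s->name, s->chain_depth);
}

int main(void)
{
	struct toy_sample samples[] = { { "poll", 70 }, { "futex", 12 } };

	for (unsigned int i = 0; i < 2; i++)
		toy_filter(&samples[i], 66);	/* e.g. --min-stack 66 */
	return 0;
}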
E.g.: System wide syscall tracing looking for call stacks longer than 66: # trace --mmap-pages 32768 --filter-pid 2711 --call-graph dwarf,16384 --min-stack 66 Or more compactly: # trace -m 32768 --filt 2711 --call dwarf,16384 --min-st 66 363.027 ( 0.002 ms): gnome-shell/2287 poll(ufds: 0x7ffc5ea24230, nfds: 1, timeout_msecs: 4294967295 ) = 1 [0xf6fdd] (/usr/lib64/libc-2.22.so) _xcb_conn_wait+0x92 (/usr/lib64/libxcb.so.1.1.0) _xcb_out_send+0x4d (/usr/lib64/libxcb.so.1.1.0) xcb_writev+0x45 (/usr/lib64/libxcb.so.1.1.0) _XSend+0x19e (/usr/lib64/libX11.so.6.3.0) _XReply+0x82 (/usr/lib64/libX11.so.6.3.0) XSync+0x4d (/usr/lib64/libX11.so.6.3.0) dri3_bind_tex_image+0x42 (/usr/lib64/libGL.so.1.2.0) _cogl_winsys_texture_pixmap_x11_update+0x117 (/usr/lib64/libcogl.so.20.4.1) _cogl_texture_pixmap_x11_update+0x67 (/usr/lib64/libcogl.so.20.4.1) _cogl_texture_pixmap_x11_pre_paint+0x13 (/usr/lib64/libcogl.so.20.4.1) _cogl_pipeline_layer_pre_paint+0x5e (/usr/lib64/libcogl.so.20.4.1) _cogl_rectangles_validate_layer_cb+0x1b (/usr/lib64/libcogl.so.20.4.1) cogl_pipeline_foreach_layer+0xbe (/usr/lib64/libcogl.so.20.4.1) _cogl_framebuffer_draw_multitextured_rectangles+0x77 (/usr/lib64/libcogl.so.20.4.1) cogl_framebuffer_draw_multitextured_rectangle+0x51 (/usr/lib64/libcogl.so.20.4.1) paint_clipped_rectangle+0xb6 (/usr/lib64/libmutter.so.0.0.0) meta_shaped_texture_paint+0x3e3 (/usr/lib64/libmutter.so.0.0.0) _g_closure_invoke_va+0xb2 (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit_valist+0xc0d (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit+0x8f (/usr/lib64/libgobject-2.0.so.0.4600.2) clutter_actor_continue_paint+0x2bb (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_paint.part.41+0x47b (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_real_paint+0x20 (/usr/lib64/libclutter-1.0.so.0.2400.2) _g_closure_invoke_va+0xb2 (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit_valist+0xc0d (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit+0x8f (/usr/lib64/libgobject-2.0.so.0.4600.2) clutter_actor_continue_paint+0x2bb (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_paint.part.41+0x47b (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_real_paint+0x20 (/usr/lib64/libclutter-1.0.so.0.2400.2) meta_window_actor_paint+0x14b (/usr/lib64/libmutter.so.0.0.0) _g_closure_invoke_va+0xb2 (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit_valist+0xc0d (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit+0x8f (/usr/lib64/libgobject-2.0.so.0.4600.2) clutter_actor_continue_paint+0x2bb (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_paint.part.41+0x47b (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_real_paint+0x20 (/usr/lib64/libclutter-1.0.so.0.2400.2) meta_window_group_paint+0x19f (/usr/lib64/libmutter.so.0.0.0) _g_closure_invoke_va+0xb2 (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit_valist+0xc0d (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit+0x8f (/usr/lib64/libgobject-2.0.so.0.4600.2) clutter_actor_continue_paint+0x2bb (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_paint.part.41+0x47b (/usr/lib64/libclutter-1.0.so.0.2400.2) [0x3d970] (/usr/lib64/gnome-shell/libgnome-shell.so) _g_closure_invoke_va+0xb2 (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit_valist+0xc0d (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit+0x8f (/usr/lib64/libgobject-2.0.so.0.4600.2) clutter_actor_continue_paint+0x2bb (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_paint.part.41+0x47b (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_stage_paint+0x3a 
(/usr/lib64/libclutter-1.0.so.0.2400.2) meta_stage_paint+0x45 (/usr/lib64/libmutter.so.0.0.0) _g_closure_invoke_va+0x164 (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit_valist+0xc0d (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit+0x8f (/usr/lib64/libgobject-2.0.so.0.4600.2) clutter_actor_continue_paint+0x2bb (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_paint.part.41+0x47b (/usr/lib64/libclutter-1.0.so.0.2400.2) _clutter_stage_do_paint+0x17b (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_stage_cogl_redraw+0x496 (/usr/lib64/libclutter-1.0.so.0.2400.2) _clutter_stage_do_update+0x117 (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_clock_dispatch+0x169 (/usr/lib64/libclutter-1.0.so.0.2400.2) g_main_context_dispatch+0x15a (/usr/lib64/libglib-2.0.so.0.4600.2) g_main_context_iterate.isra.29+0x1e0 (/usr/lib64/libglib-2.0.so.0.4600.2) g_main_loop_run+0xc2 (/usr/lib64/libglib-2.0.so.0.4600.2) meta_run+0x2c (/usr/lib64/libmutter.so.0.0.0) main+0x3f7 (/usr/bin/gnome-shell) __libc_start_main+0xf0 (/usr/lib64/libc-2.22.so) [0x2909] (/usr/bin/gnome-shell) 363.038 ( 0.006 ms): gnome-shell/2287 writev(fd: 5, vec: 0x7ffc5ea243a0, vlen: 3 ) = 4 __GI___writev+0x2d (/usr/lib64/libc-2.22.so) _xcb_conn_wait+0x359 (/usr/lib64/libxcb.so.1.1.0) _xcb_out_send+0x4d (/usr/lib64/libxcb.so.1.1.0) xcb_writev+0x45 (/usr/lib64/libxcb.so.1.1.0) _XSend+0x19e (/usr/lib64/libX11.so.6.3.0) _XReply+0x82 (/usr/lib64/libX11.so.6.3.0) XSync+0x4d (/usr/lib64/libX11.so.6.3.0) dri3_bind_tex_image+0x42 (/usr/lib64/libGL.so.1.2.0) _cogl_winsys_texture_pixmap_x11_update+0x117 (/usr/lib64/libcogl.so.20.4.1) _cogl_texture_pixmap_x11_update+0x67 (/usr/lib64/libcogl.so.20.4.1) _cogl_texture_pixmap_x11_pre_paint+0x13 (/usr/lib64/libcogl.so.20.4.1) _cogl_pipeline_layer_pre_paint+0x5e (/usr/lib64/libcogl.so.20.4.1) _cogl_rectangles_validate_layer_cb+0x1b (/usr/lib64/libcogl.so.20.4.1) cogl_pipeline_foreach_layer+0xbe (/usr/lib64/libcogl.so.20.4.1) _cogl_framebuffer_draw_multitextured_rectangles+0x77 (/usr/lib64/libcogl.so.20.4.1) cogl_framebuffer_draw_multitextured_rectangle+0x51 (/usr/lib64/libcogl.so.20.4.1) paint_clipped_rectangle+0xb6 (/usr/lib64/libmutter.so.0.0.0) meta_shaped_texture_paint+0x3e3 (/usr/lib64/libmutter.so.0.0.0) _g_closure_invoke_va+0xb2 (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit_valist+0xc0d (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit+0x8f (/usr/lib64/libgobject-2.0.so.0.4600.2) clutter_actor_continue_paint+0x2bb (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_paint.part.41+0x47b (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_real_paint+0x20 (/usr/lib64/libclutter-1.0.so.0.2400.2) _g_closure_invoke_va+0xb2 (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit_valist+0xc0d (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit+0x8f (/usr/lib64/libgobject-2.0.so.0.4600.2) clutter_actor_continue_paint+0x2bb (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_paint.part.41+0x47b (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_real_paint+0x20 (/usr/lib64/libclutter-1.0.so.0.2400.2) meta_window_actor_paint+0x14b (/usr/lib64/libmutter.so.0.0.0) _g_closure_invoke_va+0xb2 (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit_valist+0xc0d (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit+0x8f (/usr/lib64/libgobject-2.0.so.0.4600.2) clutter_actor_continue_paint+0x2bb (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_paint.part.41+0x47b (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_real_paint+0x20 (/usr/lib64/libclutter-1.0.so.0.2400.2) 
meta_window_group_paint+0x19f (/usr/lib64/libmutter.so.0.0.0) _g_closure_invoke_va+0xb2 (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit_valist+0xc0d (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit+0x8f (/usr/lib64/libgobject-2.0.so.0.4600.2) clutter_actor_continue_paint+0x2bb (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_paint.part.41+0x47b (/usr/lib64/libclutter-1.0.so.0.2400.2) [0x3d970] (/usr/lib64/gnome-shell/libgnome-shell.so) _g_closure_invoke_va+0xb2 (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit_valist+0xc0d (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit+0x8f (/usr/lib64/libgobject-2.0.so.0.4600.2) clutter_actor_continue_paint+0x2bb (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_paint.part.41+0x47b (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_stage_paint+0x3a (/usr/lib64/libclutter-1.0.so.0.2400.2) meta_stage_paint+0x45 (/usr/lib64/libmutter.so.0.0.0) _g_closure_invoke_va+0x164 (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit_valist+0xc0d (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit+0x8f (/usr/lib64/libgobject-2.0.so.0.4600.2) clutter_actor_continue_paint+0x2bb (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_paint.part.41+0x47b (/usr/lib64/libclutter-1.0.so.0.2400.2) _clutter_stage_do_paint+0x17b (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_stage_cogl_redraw+0x496 (/usr/lib64/libclutter-1.0.so.0.2400.2) _clutter_stage_do_update+0x117 (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_clock_dispatch+0x169 (/usr/lib64/libclutter-1.0.so.0.2400.2) g_main_context_dispatch+0x15a (/usr/lib64/libglib-2.0.so.0.4600.2) g_main_context_iterate.isra.29+0x1e0 (/usr/lib64/libglib-2.0.so.0.4600.2) g_main_loop_run+0xc2 (/usr/lib64/libglib-2.0.so.0.4600.2) meta_run+0x2c (/usr/lib64/libmutter.so.0.0.0) main+0x3f7 (/usr/bin/gnome-shell) __libc_start_main+0xf0 (/usr/lib64/libc-2.22.so) [0x2909] (/usr/bin/gnome-shell) 363.086 ( 0.042 ms): gnome-shell/2287 poll(ufds: 0x7ffc5ea24250, nfds: 1, timeout_msecs: 4294967295 ) = 1 [0xf6fdd] (/usr/lib64/libc-2.22.so) _xcb_conn_wait+0x92 (/usr/lib64/libxcb.so.1.1.0) wait_for_reply+0xb7 (/usr/lib64/libxcb.so.1.1.0) xcb_wait_for_reply+0x61 (/usr/lib64/libxcb.so.1.1.0) _XReply+0x127 (/usr/lib64/libX11.so.6.3.0) XSync+0x4d (/usr/lib64/libX11.so.6.3.0) dri3_bind_tex_image+0x42 (/usr/lib64/libGL.so.1.2.0) _cogl_winsys_texture_pixmap_x11_update+0x117 (/usr/lib64/libcogl.so.20.4.1) _cogl_texture_pixmap_x11_update+0x67 (/usr/lib64/libcogl.so.20.4.1) _cogl_texture_pixmap_x11_pre_paint+0x13 (/usr/lib64/libcogl.so.20.4.1) _cogl_pipeline_layer_pre_paint+0x5e (/usr/lib64/libcogl.so.20.4.1) _cogl_rectangles_validate_layer_cb+0x1b (/usr/lib64/libcogl.so.20.4.1) cogl_pipeline_foreach_layer+0xbe (/usr/lib64/libcogl.so.20.4.1) _cogl_framebuffer_draw_multitextured_rectangles+0x77 (/usr/lib64/libcogl.so.20.4.1) cogl_framebuffer_draw_multitextured_rectangle+0x51 (/usr/lib64/libcogl.so.20.4.1) paint_clipped_rectangle+0xb6 (/usr/lib64/libmutter.so.0.0.0) meta_shaped_texture_paint+0x3e3 (/usr/lib64/libmutter.so.0.0.0) _g_closure_invoke_va+0xb2 (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit_valist+0xc0d (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit+0x8f (/usr/lib64/libgobject-2.0.so.0.4600.2) clutter_actor_continue_paint+0x2bb (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_paint.part.41+0x47b (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_real_paint+0x20 (/usr/lib64/libclutter-1.0.so.0.2400.2) _g_closure_invoke_va+0xb2 (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit_valist+0xc0d 
(/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit+0x8f (/usr/lib64/libgobject-2.0.so.0.4600.2) clutter_actor_continue_paint+0x2bb (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_paint.part.41+0x47b (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_real_paint+0x20 (/usr/lib64/libclutter-1.0.so.0.2400.2) meta_window_actor_paint+0x14b (/usr/lib64/libmutter.so.0.0.0) _g_closure_invoke_va+0xb2 (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit_valist+0xc0d (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit+0x8f (/usr/lib64/libgobject-2.0.so.0.4600.2) clutter_actor_continue_paint+0x2bb (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_paint.part.41+0x47b (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_real_paint+0x20 (/usr/lib64/libclutter-1.0.so.0.2400.2) meta_window_group_paint+0x19f (/usr/lib64/libmutter.so.0.0.0) _g_closure_invoke_va+0xb2 (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit_valist+0xc0d (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit+0x8f (/usr/lib64/libgobject-2.0.so.0.4600.2) clutter_actor_continue_paint+0x2bb (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_paint.part.41+0x47b (/usr/lib64/libclutter-1.0.so.0.2400.2) [0x3d970] (/usr/lib64/gnome-shell/libgnome-shell.so) _g_closure_invoke_va+0xb2 (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit_valist+0xc0d (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit+0x8f (/usr/lib64/libgobject-2.0.so.0.4600.2) clutter_actor_continue_paint+0x2bb (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_paint.part.41+0x47b (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_stage_paint+0x3a (/usr/lib64/libclutter-1.0.so.0.2400.2) meta_stage_paint+0x45 (/usr/lib64/libmutter.so.0.0.0) _g_closure_invoke_va+0x164 (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit_valist+0xc0d (/usr/lib64/libgobject-2.0.so.0.4600.2) g_signal_emit+0x8f (/usr/lib64/libgobject-2.0.so.0.4600.2) clutter_actor_continue_paint+0x2bb (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_actor_paint.part.41+0x47b (/usr/lib64/libclutter-1.0.so.0.2400.2) _clutter_stage_do_paint+0x17b (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_stage_cogl_redraw+0x496 (/usr/lib64/libclutter-1.0.so.0.2400.2) _clutter_stage_do_update+0x117 (/usr/lib64/libclutter-1.0.so.0.2400.2) clutter_clock_dispatch+0x169 (/usr/lib64/libclutter-1.0.so.0.2400.2) g_main_context_dispatch+0x15a (/usr/lib64/libglib-2.0.so.0.4600.2) g_main_context_iterate.isra.29+0x1e0 (/usr/lib64/libglib-2.0.so.0.4600.2) g_main_loop_run+0xc2 (/usr/lib64/libglib-2.0.so.0.4600.2) meta_run+0x2c (/usr/lib64/libmutter.so.0.0.0) main+0x3f7 (/usr/bin/gnome-shell) __libc_start_main+0xf0 (/usr/lib64/libc-2.22.so) [0x2909] (/usr/bin/gnome-shell) Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-jncuxju9fibq2rl6olhqwjw6@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-trace.txt | 4 ++ tools/perf/builtin-trace.c | 55 +++++++++++++++++-------- 2 files changed, 41 insertions(+), 18 deletions(-) diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index 2ee0c4fee18d3..4e8baa75a32ea 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -138,6 +138,10 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs. Default: 127 +--min-stack:: + Set the stack depth limit when parsing the callchain, anything + below the specified depth will be ignored. Disabled by default. 
+ --proc-map-timeout:: When processing pre-existing threads /proc/XXX/mmap, it may take a long time, because the file may be huge. A time out is needed in such cases. diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 65f6abe75d714..6a64cb1344c70 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -108,6 +108,7 @@ struct trace { proc_getname; } stats; unsigned int max_stack; + unsigned int min_stack; bool not_ev_qualifier; bool live; bool full_time; @@ -1849,7 +1850,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, goto out_put; } - if (!(trace->duration_filter || trace->summary_only)) + if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) trace__printf_interrupted_entry(trace, sample); ttrace->entry_time = sample->time; @@ -1860,7 +1861,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, args, trace, thread); if (sc->is_exit) { - if (!trace->duration_filter && !trace->summary_only) { + if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) { trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output); fprintf(trace->output, "%-70s\n", ttrace->entry_str); } @@ -1880,26 +1881,26 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, return err; } -static int trace__fprintf_callchain(struct trace *trace, struct perf_evsel *evsel, - struct perf_sample *sample) +static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel, + struct perf_sample *sample, + struct callchain_cursor *cursor) { struct addr_location al; + + if (machine__resolve(trace->host, &al, sample) < 0 || + thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack)) + return -1; + + return 0; +} + +static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample) +{ /* TODO: user-configurable print_opts */ const unsigned int print_opts = EVSEL__PRINT_SYM | EVSEL__PRINT_DSO | EVSEL__PRINT_UNKNOWN_AS_ADDR; - if (sample->callchain == NULL) - return 0; - - if (machine__resolve(trace->host, &al, sample) < 0 || - thread__resolve_callchain(al.thread, &callchain_cursor, evsel, - sample, NULL, NULL, trace->max_stack)) { - pr_err("Problem processing %s callchain, skipping...\n", - perf_evsel__name(evsel)); - return 0; - } - return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output); } @@ -1910,7 +1911,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, long ret; u64 duration = 0; struct thread *thread; - int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; + int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0; struct syscall *sc = trace__syscall_info(trace, evsel, id); struct thread_trace *ttrace; @@ -1942,6 +1943,15 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, } else if (trace->duration_filter) goto out; + if (sample->callchain) { + callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); + if (callchain_ret == 0) { + if (callchain_cursor.nr < trace->min_stack) + goto out; + callchain_ret = 1; + } + } + if (trace->summary_only) goto out; @@ -1982,7 +1992,10 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, fputc('\n', trace->output); - trace__fprintf_callchain(trace, evsel, sample); + if (callchain_ret > 0) + trace__fprintf_callchain(trace, sample); + else if (callchain_ret < 0) + pr_err("Problem processing %s 
callchain, skipping...\n", perf_evsel__name(evsel)); out: ttrace->entry_pending = false; err = 0; @@ -2131,7 +2144,10 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, fprintf(trace->output, ")\n"); - trace__fprintf_callchain(trace, evsel, sample); + if (sample->callchain) { + if (trace__resolve_callchain(trace, evsel, sample, &callchain_cursor) == 0) + trace__fprintf_callchain(trace, sample); + } return 0; } @@ -3082,6 +3098,9 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) &record_parse_callchain_opt), OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains, "Show the kernel callchains on the syscall exit path"), + OPT_UINTEGER(0, "min-stack", &trace.min_stack, + "Set the minimum stack depth when parsing the callchain, " + "anything below the specified depth will be ignored."), OPT_UINTEGER(0, "max-stack", &trace.max_stack, "Set the maximum stack depth when parsing the callchain, " "anything beyond the specified depth will be ignored. " -- GitLab From 0883e820a0ac18e04f036dbebc3580351d7fd6cf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 15 Apr 2016 16:37:17 -0300 Subject: [PATCH 265/705] perf record: Export record_opts based callchain parsing helper To be able to call it outside option parsing, like when setting a default --call-graph parameter in 'perf trace' when just --min-stack is used. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-xay69plylwibpb3l4isrpl1k@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 35 ++++++++++++++++++++--------------- tools/perf/util/callchain.h | 6 ++++++ 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 3239a6ec9d230..5b4758a08a49b 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -930,45 +930,50 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) return status; } -static void callchain_debug(void) +static void callchain_debug(struct callchain_param *callchain) { static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" }; - pr_debug("callchain: type %s\n", str[callchain_param.record_mode]); + pr_debug("callchain: type %s\n", str[callchain->record_mode]); - if (callchain_param.record_mode == CALLCHAIN_DWARF) + if (callchain->record_mode == CALLCHAIN_DWARF) pr_debug("callchain: stack dump size %d\n", - callchain_param.dump_size); + callchain->dump_size); } -int record_parse_callchain_opt(const struct option *opt, - const char *arg, - int unset) +int record_opts__parse_callchain(struct record_opts *record, + struct callchain_param *callchain, + const char *arg, bool unset) { int ret; - struct record_opts *record = (struct record_opts *)opt->value; - record->callgraph_set = true; - callchain_param.enabled = !unset; + callchain->enabled = !unset; /* --no-call-graph */ if (unset) { - callchain_param.record_mode = CALLCHAIN_NONE; + callchain->record_mode = CALLCHAIN_NONE; pr_debug("callchain: disabled\n"); return 0; } - ret = parse_callchain_record_opt(arg, &callchain_param); + ret = parse_callchain_record_opt(arg, callchain); if (!ret) { /* Enable data address sampling for DWARF unwind. 
*/ - if (callchain_param.record_mode == CALLCHAIN_DWARF) + if (callchain->record_mode == CALLCHAIN_DWARF) record->sample_address = true; - callchain_debug(); + callchain_debug(callchain); } return ret; } +int record_parse_callchain_opt(const struct option *opt, + const char *arg, + int unset) +{ + return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset); +} + int record_callchain_opt(const struct option *opt, const char *arg __maybe_unused, int unset __maybe_unused) @@ -981,7 +986,7 @@ int record_callchain_opt(const struct option *opt, if (callchain_param.record_mode == CALLCHAIN_NONE) callchain_param.record_mode = CALLCHAIN_FP; - callchain_debug(); + callchain_debug(&callchain_param); return 0; } diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index cae5a7b1f5c8f..65e2a4f7cb4e8 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -212,6 +212,12 @@ struct hist_entry; int record_parse_callchain_opt(const struct option *opt, const char *arg, int unset); int record_callchain_opt(const struct option *opt, const char *arg, int unset); +struct record_opts; + +int record_opts__parse_callchain(struct record_opts *record, + struct callchain_param *callchain, + const char *arg, bool unset); + int sample__resolve_callchain(struct perf_sample *sample, struct callchain_cursor *cursor, struct symbol **parent, struct perf_evsel *evsel, struct addr_location *al, -- GitLab From 056149932602ef905f1e26fc4fe242ef0533a597 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 15 Apr 2016 16:41:19 -0300 Subject: [PATCH 266/705] perf trace: Make --{min,max}-stack imply "--call-graph dwarf" If one uses: # perf trace --min-stack 16 then it implicitly means that callgraphs should be enabled, and the best option in terms of widespread availability is "dwarf". Further work is needed to choose a better alternative, LBR, on capable systems. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-xtjmnpkyk42npekxz3kynzmx@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-trace.txt | 6 ++++++ tools/perf/builtin-trace.c | 13 ++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index 4e8baa75a32ea..146c6db21cbf1 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -136,12 +136,18 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs. not limiting, the overhead of callchains needs to be set via the knobs in --call-graph dwarf. + Implies '--call-graph dwarf' when --call-graph not present on the + command line, on systems where DWARF unwinding was built in. + Default: 127 --min-stack:: Set the stack depth limit when parsing the callchain, anything below the specified depth will be ignored. Disabled by default. + Implies '--call-graph dwarf' when --call-graph not present on the + command line, on systems where DWARF unwinding was built in. + --proc-map-timeout:: When processing pre-existing threads /proc/XXX/mmap, it may take a long time, because the file may be huge. A time out is needed in such cases.
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 6a64cb1344c70..19f5100acc1d4 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3047,7 +3047,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) .show_comm = true, .trace_syscalls = true, .kernel_syscallchains = false, - .max_stack = PERF_MAX_STACK_DEPTH, + .max_stack = UINT_MAX, }; const char *output_name = NULL; const char *ev_qualifier_str = NULL; @@ -3109,6 +3109,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) "per thread proc mmap processing timeout in ms"), OPT_END() }; + bool max_stack_user_set = true; const char * const trace_subcommands[] = { "record", NULL }; int err; char bf[BUFSIZ]; @@ -3142,6 +3143,16 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) trace.opts.sample_time = true; } + if (trace.max_stack == UINT_MAX) { + trace.max_stack = PERF_MAX_STACK_DEPTH; + max_stack_user_set = false; + } + +#ifdef HAVE_DWARF_UNWIND_SUPPORT + if ((trace.min_stack || max_stack_user_set) && !trace.opts.callgraph_set) + record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false); +#endif + if (trace.opts.callgraph_set) symbol_conf.use_callchain = true; -- GitLab From f5e7150cd9a7779a54b192d21afb9245384db8bc Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 15 Apr 2016 17:46:31 -0300 Subject: [PATCH 267/705] perf evlist: Expose perf_event_mlock_kb_in_pages() helper When the user doesn't set --mmap-pages, perf_evlist__mmap() will do it by reading the maximum possible for a non-root user from the /proc/sys/kernel/perf_event_mlock_kb file. Expose that function so that 'perf trace' can, for root users, bump mmap-pages to a higher value based on the contents of this proc file. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-xay69plylwibpb3l4isrpl1k@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 42 ++++++++++++++++++++++++---------------- tools/perf/util/evlist.h | 2 ++ 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 4c9f510ae18da..6fb5725821de7 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -986,26 +986,34 @@ static int perf_evlist__mmap_per_thread(struct perf_evlist *evlist, return -1; } -static size_t perf_evlist__mmap_size(unsigned long pages) +unsigned long perf_event_mlock_kb_in_pages(void) { - if (pages == UINT_MAX) { - int max; + unsigned long pages; + int max; - if (sysctl__read_int("kernel/perf_event_mlock_kb", &max) < 0) { - /* - * Pick a once upon a time good value, i.e. things look - * strange since we can't read a sysctl value, but lets not - * die yet... - */ - max = 512; - } else { - max -= (page_size / 1024); - } + if (sysctl__read_int("kernel/perf_event_mlock_kb", &max) < 0) { + /* + * Pick a once upon a time good value, i.e. things look + * strange since we can't read a sysctl value, but lets not + * die yet...
+ */ + max = 512; + } else { + max -= (page_size / 1024); + } + + pages = (max * 1024) / page_size; + if (!is_power_of_2(pages)) + pages = rounddown_pow_of_two(pages); - pages = (max * 1024) / page_size; - if (!is_power_of_2(pages)) - pages = rounddown_pow_of_two(pages); - } else if (!is_power_of_2(pages)) + return pages; +} + +static size_t perf_evlist__mmap_size(unsigned long pages) +{ + if (pages == UINT_MAX) + pages = perf_event_mlock_kb_in_pages(); + else if (!is_power_of_2(pages)) return 0; return (pages + 1) * page_size; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index da46423998e8c..208897a646cae 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -158,6 +158,8 @@ int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str, int unset); +unsigned long perf_event_mlock_kb_in_pages(void); + int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages, bool overwrite, unsigned int auxtrace_pages, bool auxtrace_overwrite); -- GitLab From f3e459d16a8493b617ccf2a940330279679e0291 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 15 Apr 2016 17:52:34 -0300 Subject: [PATCH 268/705] perf trace: Bump --mmap-pages when --call-graph is used by the root user This reduces the chances we'll overflow the mmap buffer; manual fine-tuning still trumps this default. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-wxygbxmp1v9mng1ea28wet02@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-trace.txt | 4 ++++ tools/perf/builtin-trace.c | 10 +++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index 146c6db21cbf1..c075c002eaa40 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -123,6 +123,10 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs. man pages for details. The ones that are most useful in 'perf trace' are 'dwarf' and 'lbr', where available, try: 'perf trace --call-graph dwarf'. + Using this will, for the root user, bump the value of --mmap-pages to 4 + times the maximum for non-root users, based on the kernel.perf_event_mlock_kb + sysctl. This is done only if the user doesn't specify a --mmap-pages value. + --kernel-syscall-graph:: Show the kernel callchains on the syscall exit path.
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 19f5100acc1d4..026ec0c749b04 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3110,6 +3110,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) OPT_END() }; bool max_stack_user_set = true; + bool mmap_pages_user_set = true; const char * const trace_subcommands[] = { "record", NULL }; int err; char bf[BUFSIZ]; @@ -3143,6 +3144,9 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) trace.opts.sample_time = true; } + if (trace.opts.mmap_pages == UINT_MAX) + mmap_pages_user_set = false; + if (trace.max_stack == UINT_MAX) { trace.max_stack = PERF_MAX_STACK_DEPTH; max_stack_user_set = false; @@ -3153,8 +3157,12 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false); #endif - if (trace.opts.callgraph_set) + if (trace.opts.callgraph_set) { + if (!mmap_pages_user_set && geteuid() == 0) + trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4; + symbol_conf.use_callchain = true; + } if (trace.evlist->nr_entries > 0) evlist__set_evsel_handler(trace.evlist, trace__event_handler); -- GitLab From a3819e3e71d5000c176918309284a1fa2f133fcf Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 15 Apr 2016 19:00:26 +0200 Subject: [PATCH 269/705] x86: Fix non-static inlines Four instances of incorrect usage of non-static "inline" crept up in arch/x86, all trivial; cleaning them up: EVT_TO_HPET_DEV() - made static; it is only used in kernel/hpet.c. Debug version of check_iommu_entries() is an __init function. Non-debug dummy empty version of it is declared "inline" instead - which doesn't help eliminate it (the caller is in a different unit, inlining doesn't happen). Switch to non-inlined __init function, which does eliminate it (by discarding it as part of the __init section). crypto/sha-mb/sha1_mb.c: looks like they just forgot to add "static" to their two internal inlines, which emitted two unused functions into vmlinux. text data bss dec hex filename 95903394 20860288 35991552 152755234 91adc22 vmlinux_before 95903266 20860288 35991552 152755106 91adba2 vmlinux Signed-off-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1460739626-12179-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/crypto/sha-mb/sha1_mb.c | 4 ++-- arch/x86/kernel/hpet.c | 2 +- arch/x86/kernel/pci-iommu_table.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index a8a0224fa0f8a..fb9c7a84700c8 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -102,14 +102,14 @@ static asmlinkage struct job_sha1* (*sha1_job_mgr_submit)(struct sha1_mb_mgr *st static asmlinkage struct job_sha1* (*sha1_job_mgr_flush)(struct sha1_mb_mgr *state); static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job)(struct sha1_mb_mgr *state); -inline void sha1_init_digest(uint32_t *digest) +static inline void sha1_init_digest(uint32_t *digest) { static const uint32_t initial_digest[SHA1_DIGEST_LENGTH] = {SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }; memcpy(digest, initial_digest, sizeof(initial_digest)); } -inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], +static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint32_t total_len) { uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1); diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index a1f0e4a5c47e3..130f2b4b8ecb8 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -54,7 +54,7 @@ struct hpet_dev { char name[10]; }; -inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev) +static inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev) { return container_of(evtdev, struct hpet_dev, evt); } diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c index 35ccf75696eb8..f712dfdf1357f 100644 --- a/arch/x86/kernel/pci-iommu_table.c +++ b/arch/x86/kernel/pci-iommu_table.c @@ -72,7 +72,7 @@ void __init check_iommu_entries(struct iommu_table_entry *start, } } #else -inline void check_iommu_entries(struct iommu_table_entry *start, +void __init check_iommu_entries(struct iommu_table_entry *start, struct iommu_table_entry *finish) { } -- GitLab From ccd62a896ffe3dbd60f3b7570a2b74e4fe030ed6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 16 Apr 2016 09:36:32 -0300 Subject: [PATCH 270/705] perf trace: Fix build when DWARF unwind isn't available MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The variable is initialized and then conditionally set to a different value, but not used when DWARF unwinding is not available. Bummer; write 1000 times: "Run make -C tools/perf build-test"... builtin-trace.c: In function ‘cmd_trace’: builtin-trace.c:3112:6: error: variable ‘max_stack_user_set’ set but not used [-Werror=unused-but-set-variable] bool max_stack_user_set = true; ^ cc1: all warnings being treated as errors Fix it by marking it as __maybe_unused.
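For reference, __maybe_unused is the kernel's wrapper around the compiler's "unused" attribute (__attribute__((unused)) on GCC) and is the idiomatic way to silence this class of warning when a variable's only reader can be compiled out. An illustrative pattern (hypothetical names, not from this patch):

	bool __maybe_unused user_set = true;	/* only read under the #ifdef below */
#ifdef HAVE_SOME_FEATURE
	if (user_set)
		tweak_defaults();		/* hypothetical consumer */
#endif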
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Fixes: 056149932602 ("perf trace: Make --{min,max}-stack imply "--call-graph dwarf"") Link: http://lkml.kernel.org/n/tip-85r40c5hhv6jnmph77l1hgsr@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 026ec0c749b04..0e3c1cecef1b1 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3109,7 +3109,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) "per thread proc mmap processing timeout in ms"), OPT_END() }; - bool max_stack_user_set = true; + bool __maybe_unused max_stack_user_set = true; bool mmap_pages_user_set = true; const char * const trace_subcommands[] = { "record", NULL }; int err; -- GitLab From acf2abbd0b7fcc6325e9690a8a32ee924c827f70 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 18 Apr 2016 10:35:03 -0300 Subject: [PATCH 271/705] perf evsel: Add missing class prefix to has_branch_callstack method Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-5i07ivw1yjsweb7gztr255jd@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.h | 2 +- tools/perf/util/machine.c | 2 +- tools/perf/util/session.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index b993218744d46..8a644fef452c0 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -420,7 +420,7 @@ for ((_evsel) = list_entry((_leader)->node.next, struct perf_evsel, node); \ (_evsel) && (_evsel)->leader == (_leader); \ (_evsel) = list_entry((_evsel)->node.next, struct perf_evsel, node)) -static inline bool has_branch_callstack(struct perf_evsel *evsel) +static inline bool perf_evsel__has_branch_callstack(const struct perf_evsel *evsel) { return evsel->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK; } diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 0c4dabc699329..52b51e004fe8f 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1808,7 +1808,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, callchain_cursor_reset(cursor); - if (has_branch_callstack(evsel)) { + if (perf_evsel__has_branch_callstack(evsel)) { err = resolve_lbr_callchain_sample(thread, cursor, sample, parent, root_al, max_stack); if (err) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index ca1827c4af4a4..2335b2824d8af 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -907,7 +907,7 @@ static void callchain__printf(struct perf_evsel *evsel, unsigned int i; struct ip_callchain *callchain = sample->callchain; - if (has_branch_callstack(evsel)) + if (perf_evsel__has_branch_callstack(evsel)) callchain__lbr_callstack_printf(sample); printf("...
FP chain: nr:%" PRIu64 "\n", callchain->nr); @@ -1081,7 +1081,7 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event, if (sample_type & PERF_SAMPLE_CALLCHAIN) callchain__printf(evsel, sample); - if ((sample_type & PERF_SAMPLE_BRANCH_STACK) && !has_branch_callstack(evsel)) + if ((sample_type & PERF_SAMPLE_BRANCH_STACK) && !perf_evsel__has_branch_callstack(evsel)) branch_stack__printf(sample); if (sample_type & PERF_SAMPLE_REGS_USER) -- GitLab From 922315210b8007a26374e30712813b714af71cac Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 18 Apr 2016 11:31:46 -0300 Subject: [PATCH 272/705] perf script: Check sample->callchain before using it Found by code inspection while looking at thread__resolve_callchain() call sites: one had it, the other didn't. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-6r8i2afd3523thuuaxl39yhk@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 0e93282b405ed..5099740aa50bc 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -791,7 +791,7 @@ static void process_event(struct perf_script *script, if (PRINT_FIELD(IP)) { struct callchain_cursor *cursor = NULL, cursor_callchain; - if (symbol_conf.use_callchain && + if (symbol_conf.use_callchain && sample->callchain && thread__resolve_callchain(al->thread, &cursor_callchain, evsel, sample, NULL, NULL, scripting_max_stack) == 0) cursor = &cursor_callchain; -- GitLab From 30234f0925c1deeb472b579b57a9f50791157c58 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 18 Apr 2016 11:53:07 -0300 Subject: [PATCH 273/705] perf callchain: Set callchain_param.enabled when parsing --call-graph Trying to move in the direction of using callchain_param for all callchain parameters, eventually ditching them from symbol_conf. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-kixllia6r26mz45ng056zq7z@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 2b4ceaf058bb3..aa248dcb44406 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -109,6 +109,7 @@ __parse_callchain_report_opt(const char *arg, bool allow_record_opt) bool record_opt_set = false; bool try_stack_size = false; + callchain_param.enabled = true; symbol_conf.use_callchain = true; if (!arg) @@ -117,6 +118,7 @@ __parse_callchain_report_opt(const char *arg, bool allow_record_opt) while ((tok = strtok((char *)arg, ",")) != NULL) { if (!strncmp(tok, "none", strlen(tok))) { callchain_param.mode = CHAIN_NONE; + callchain_param.enabled = false; symbol_conf.use_callchain = false; return 0; } -- GitLab From 1cc83815d5fdb40a7d06c3f9871134a10e5ffa79 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 18 Apr 2016 11:54:31 -0300 Subject: [PATCH 274/705] perf report: Use callchain_param.enabled instead of tool specific knob We have callchain_param.enabled, so no need to have something just for 'perf report' to do the same thing.
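Condensed from the diff that follows, the parsing callback now receives the shared struct via opt->value and flips its 'enabled' field instead of keeping a report-private flag (a sketch; the tail of the function is elided):

	static int report_parse_callchain_opt(const struct option *opt,
					      const char *arg, int unset)
	{
		struct callchain_param *callchain = opt->value;

		callchain->enabled = !unset;
		if (unset) {	/* --no-call-graph */
			symbol_conf.use_callchain = false;
			callchain->mode = CHAIN_NONE;
			return 0;
		}
		/* ... parse 'arg' as before ... */
	}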
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-wbeisubpualwogwi5u8utnt1@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 160ea23b45aaf..1d5be0bd426f7 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -47,7 +47,6 @@ struct report { struct perf_tool tool; struct perf_session *session; bool use_tui, use_gtk, use_stdio; - bool dont_use_callchains; bool show_full_info; bool show_threads; bool inverted_callchain; @@ -247,7 +246,7 @@ static int report__setup_sample_type(struct report *rep) "you call 'perf record' without -g?\n"); return -1; } - } else if (!rep->dont_use_callchains && + } else if (!callchain_param.enabled && callchain_param.mode != CHAIN_NONE && !symbol_conf.use_callchain) { symbol_conf.use_callchain = true; @@ -599,13 +598,15 @@ static int __cmd_report(struct report *rep) static int report_parse_callchain_opt(const struct option *opt, const char *arg, int unset) { - struct report *rep = (struct report *)opt->value; + struct callchain_param *callchain = opt->value; + callchain->enabled = !unset; /* * --no-call-graph */ if (unset) { - rep->dont_use_callchains = true; + symbol_conf.use_callchain = false; + callchain->mode = CHAIN_NONE; return 0; } @@ -734,7 +735,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) "regex filter to identify parent, see: '--sort parent'"), OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other, "Only display entries with parent-match"), - OPT_CALLBACK_DEFAULT('g', "call-graph", &report, + OPT_CALLBACK_DEFAULT('g', "call-graph", &callchain_param, "print_type,threshold[,print_limit],order,sort_key[,branch],value", report_callchain_help, &report_parse_callchain_opt, callchain_default_opt), -- GitLab From 2ddd5c049e71dd8551c268e7386fefeb7495e988 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 18 Apr 2016 12:09:08 -0300 Subject: [PATCH 275/705] perf tools: Ditch record_opts.callgraph_set We have callchain_param.enabled for that. 
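On the consumer side the tests become correspondingly direct; for example, condensed from the builtin-trace.c hunk below:

	if (callchain_param.enabled && trace->syscalls.events.sys_exit)
		perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
					     &trace->opts, &callchain_param);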
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-silwqjc2t25ls42dsvg28pp5@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 14 ++++++-------- tools/perf/builtin-top.c | 13 ++++++------- tools/perf/builtin-trace.c | 8 ++++---- tools/perf/perf.h | 1 - 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 5b4758a08a49b..bd9593346bb2f 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -946,7 +946,6 @@ int record_opts__parse_callchain(struct record_opts *record, const char *arg, bool unset) { int ret; - record->callgraph_set = true; callchain->enabled = !unset; /* --no-call-graph */ @@ -978,15 +977,14 @@ int record_callchain_opt(const struct option *opt, const char *arg __maybe_unused, int unset __maybe_unused) { - struct record_opts *record = (struct record_opts *)opt->value; + struct callchain_param *callchain = opt->value; - record->callgraph_set = true; - callchain_param.enabled = true; + callchain->enabled = true; - if (callchain_param.record_mode == CALLCHAIN_NONE) - callchain_param.record_mode = CALLCHAIN_FP; + if (callchain->record_mode == CALLCHAIN_NONE) + callchain->record_mode = CALLCHAIN_FP; - callchain_debug(&callchain_param); + callchain_debug(callchain); return 0; } @@ -1224,7 +1222,7 @@ struct option __record_options[] = { record__parse_mmap_pages), OPT_BOOLEAN(0, "group", &record.opts.group, "put the counters into a counter group"), - OPT_CALLBACK_NOOPT('g', NULL, &record.opts, + OPT_CALLBACK_NOOPT('g', NULL, &callchain_param, NULL, "enables call-graph recording" , &record_callchain_opt), OPT_CALLBACK(0, "call-graph", &record.opts, diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 8846df0ec0c3f..f0cfdf394fac1 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1045,18 +1045,17 @@ callchain_opt(const struct option *opt, const char *arg, int unset) static int parse_callchain_opt(const struct option *opt, const char *arg, int unset) { - struct record_opts *record = (struct record_opts *)opt->value; + struct callchain_param *callchain = opt->value; - record->callgraph_set = true; - callchain_param.enabled = !unset; - callchain_param.record_mode = CALLCHAIN_FP; + callchain->enabled = !unset; + callchain->record_mode = CALLCHAIN_FP; /* * --no-call-graph */ if (unset) { symbol_conf.use_callchain = false; - callchain_param.record_mode = CALLCHAIN_NONE; + callchain->record_mode = CALLCHAIN_NONE; return 0; } @@ -1162,10 +1161,10 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) "output field(s): overhead, period, sample plus all of sort keys"), OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples, "Show a column with the number of samples"), - OPT_CALLBACK_NOOPT('g', NULL, &top.record_opts, + OPT_CALLBACK_NOOPT('g', NULL, &callchain_param, NULL, "enables call-graph recording and display", &callchain_opt), - OPT_CALLBACK(0, "call-graph", &top.record_opts, + OPT_CALLBACK(0, "call-graph", &callchain_param, "record_mode[,record_size],print_type,threshold[,print_limit],order,sort_key[,branch]", top_callchain_help, &parse_callchain_opt), OPT_BOOLEAN(0, "children", &symbol_conf.cumulate_callchain, diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 0e3c1cecef1b1..5e2614bbb48da 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2457,7 +2457,7 @@ 
static int trace__add_syscall_newtp(struct trace *trace) perf_evlist__add(evlist, sys_enter); perf_evlist__add(evlist, sys_exit); - if (trace->opts.callgraph_set && !trace->kernel_syscallchains) { + if (callchain_param.enabled && !trace->kernel_syscallchains) { /* * We're interested only in the user space callchain * leading to the syscall, allow overriding that for @@ -2546,7 +2546,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) perf_evlist__config(evlist, &trace->opts, NULL); - if (trace->opts.callgraph_set && trace->syscalls.events.sys_exit) { + if (callchain_param.enabled && trace->syscalls.events.sys_exit) { perf_evsel__config_callchain(trace->syscalls.events.sys_exit, &trace->opts, &callchain_param); /* @@ -3153,11 +3153,11 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) } #ifdef HAVE_DWARF_UNWIND_SUPPORT - if ((trace.min_stack || max_stack_user_set) && !trace.opts.callgraph_set) + if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false); #endif - if (trace.opts.callgraph_set) { + if (callchain_param.enabled) { if (!mmap_pages_user_set && geteuid() == 0) trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4; diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 5381a01c0610c..cd8f1b150f9ec 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -52,7 +52,6 @@ struct record_opts { bool sample_weight; bool sample_time; bool sample_time_set; - bool callgraph_set; bool period; bool running_time; bool full_auxtrace; -- GitLab From 1b6b678ecfb73724914a8b12d57909a4c514a9bd Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 18 Apr 2016 12:24:41 -0300 Subject: [PATCH 276/705] perf hists browser: Fold two consecutive symbol_conf.use_callchain ifs Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-u701i6qpecgm9jiat52i8l98@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/hists.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index e70df2e54d667..6a4681932ba57 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1896,11 +1896,10 @@ static int hist_browser__fprintf_entry(struct hist_browser *browser, bool first = true; int ret; - if (symbol_conf.use_callchain) + if (symbol_conf.use_callchain) { folded_sign = hist_entry__folded(he); - - if (symbol_conf.use_callchain) printed += fprintf(fp, "%c ", folded_sign); + } hists__for_each_format(browser->hists, fmt) { if (perf_hpp__should_skip(fmt, he->hists)) -- GitLab From e3815264a6c57147f8b5639536b1df3c98244642 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 18 Apr 2016 12:30:16 -0300 Subject: [PATCH 277/705] perf top: Use callchain_param.enabled instead of symbol_conf.use_callchain One more step in the direction of using just callchain_param for callchain parameters. 
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-3b1o9kb2dc94zldz0klckti6@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index f0cfdf394fac1..c130a11d3a0d7 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -917,15 +917,15 @@ static int perf_top__start_counters(struct perf_top *top) return -1; } -static int perf_top__setup_sample_type(struct perf_top *top __maybe_unused) +static int callchain_param__setup_sample_type(struct callchain_param *callchain) { if (!sort__has_sym) { - if (symbol_conf.use_callchain) { + if (callchain->enabled) { ui__error("Selected -g but \"sym\" not present in --sort/-s."); return -EINVAL; } - } else if (callchain_param.mode != CHAIN_NONE) { - if (callchain_register_param(&callchain_param) < 0) { + } else if (callchain->mode != CHAIN_NONE) { + if (callchain_register_param(callchain) < 0) { ui__error("Can't register callchain params.\n"); return -EINVAL; } @@ -952,7 +952,7 @@ static int __cmd_top(struct perf_top *top) goto out_delete; } - ret = perf_top__setup_sample_type(top); + ret = callchain_param__setup_sample_type(&callchain_param); if (ret) goto out_delete; @@ -1311,7 +1311,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) top.sym_evsel = perf_evlist__first(top.evlist); - if (!symbol_conf.use_callchain) { + if (!callchain_param.enabled) { symbol_conf.cumulate_callchain = false; perf_hpp__cancel_cumulate(); } -- GitLab From 9b238748cb6e9fadab0e761f6d30ba311b4ac470 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 18 Apr 2016 09:42:10 -0700 Subject: [PATCH 278/705] x86/KASLR: Rename aslr.c to kaslr.c In order to avoid confusion over what this file provides, rename it to kaslr.c since it is used exclusively for the kernel ASLR, not userspace ASLR. Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. 
Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1460997735-24785-2-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/Makefile | 2 +- arch/x86/boot/compressed/{aslr.c => kaslr.c} | 0 arch/x86/boot/compressed/misc.h | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename arch/x86/boot/compressed/{aslr.c => kaslr.c} (100%) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 8774cb23064fe..542c92f5ca4f3 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -62,7 +62,7 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ $(obj)/piggy.o $(obj)/cpuflags.o vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o -vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/aslr.o +vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/kaslr.c similarity index 100% rename from arch/x86/boot/compressed/aslr.c rename to arch/x86/boot/compressed/kaslr.c diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 3783dc3e10b31..a8c4e087e14d4 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -66,7 +66,7 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE -/* aslr.c */ +/* kaslr.c */ unsigned char *choose_kernel_location(struct boot_params *boot_params, unsigned char *input, unsigned long input_size, -- GitLab From 206f25a8319b312b9983953a308b0e38e1943c1c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 18 Apr 2016 09:42:11 -0700 Subject: [PATCH 279/705] x86/KASLR: Remove unneeded boot_params argument Since the boot_params can be found using the real_mode global variable, there is no need to pass around a pointer to it. This slightly simplifies the choose_kernel_location function and its callers. [kees: rewrote changelog, tracked file rename] Signed-off-by: Yinghai Lu Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1460997735-24785-3-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/kaslr.c | 5 ++--- arch/x86/boot/compressed/misc.c | 2 +- arch/x86/boot/compressed/misc.h | 6 ++---- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 6a9b96b4624d2..622aa881c6ab1 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -295,8 +295,7 @@ static unsigned long find_random_addr(unsigned long minimum, return slots_fetch_random(); } -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, +unsigned char *choose_kernel_location(unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size) @@ -316,7 +315,7 @@ unsigned char *choose_kernel_location(struct boot_params *boot_params, } #endif - boot_params->hdr.loadflags |= KASLR_FLAG; + real_mode->hdr.loadflags |= KASLR_FLAG; /* Record the various known unsafe memory ranges. 
*/ mem_avoid_init((unsigned long)input, input_size, diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 79dac1758e7c0..f35ad9eb1bf1d 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -428,7 +428,7 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, * the entire decompressed kernel plus relocation table, or the * entire decompressed kernel plus .bss and .brk sections. */ - output = choose_kernel_location(real_mode, input_data, input_len, output, + output = choose_kernel_location(input_data, input_len, output, output_len > run_size ? output_len : run_size); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index a8c4e087e14d4..22346d5a2fa0c 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -67,8 +67,7 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE /* kaslr.c */ -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, +unsigned char *choose_kernel_location(unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size); @@ -76,8 +75,7 @@ unsigned char *choose_kernel_location(struct boot_params *boot_params, bool has_cpuflag(int flag); #else static inline -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, +unsigned char *choose_kernel_location(unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size) -- GitLab From 6655e0aaf768c39a62eea739c453b9db1e841cfb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 18 Apr 2016 09:42:12 -0700 Subject: [PATCH 280/705] x86/boot: Rename "real_mode" to "boot_params" The non-compressed boot code uses the (much more obvious) name "boot_params" for the global pointer to the x86 boot parameters. The compressed kernel loader code, though, was using the legacy name "real_mode". There is no need to have a different name, and changing it improves readability. Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. 
Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1460997735-24785-4-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/cmdline.c | 4 ++-- arch/x86/boot/compressed/kaslr.c | 22 +++++++++++----------- arch/x86/boot/compressed/misc.c | 27 ++++++++++++++------------- arch/x86/boot/compressed/misc.h | 2 +- 4 files changed, 28 insertions(+), 27 deletions(-) diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index b68e3033e6b9b..73ccf63b0f48c 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -15,9 +15,9 @@ static inline char rdfs8(addr_t addr) #include "../cmdline.c" static unsigned long get_cmd_line_ptr(void) { - unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr; + unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr; - cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32; + cmd_line_ptr |= (u64)boot_params->ext_cmd_line_ptr << 32; return cmd_line_ptr; } diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 622aa881c6ab1..a51ec841c9b93 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -55,7 +55,7 @@ static unsigned long get_random_boot(void) unsigned long hash = 0; hash = rotate_xor(hash, build_str, sizeof(build_str)); - hash = rotate_xor(hash, real_mode, sizeof(*real_mode)); + hash = rotate_xor(hash, boot_params, sizeof(*boot_params)); return hash; } @@ -152,16 +152,16 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, mem_avoid[0].size = unsafe_len; /* Avoid initrd. */ - initrd_start = (u64)real_mode->ext_ramdisk_image << 32; - initrd_start |= real_mode->hdr.ramdisk_image; - initrd_size = (u64)real_mode->ext_ramdisk_size << 32; - initrd_size |= real_mode->hdr.ramdisk_size; + initrd_start = (u64)boot_params->ext_ramdisk_image << 32; + initrd_start |= boot_params->hdr.ramdisk_image; + initrd_size = (u64)boot_params->ext_ramdisk_size << 32; + initrd_size |= boot_params->hdr.ramdisk_size; mem_avoid[1].start = initrd_start; mem_avoid[1].size = initrd_size; /* Avoid kernel command line. */ - cmd_line = (u64)real_mode->ext_cmd_line_ptr << 32; - cmd_line |= real_mode->hdr.cmd_line_ptr; + cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32; + cmd_line |= boot_params->hdr.cmd_line_ptr; /* Calculate size of cmd_line. */ ptr = (char *)(unsigned long)cmd_line; for (cmd_line_size = 0; ptr[cmd_line_size++]; ) @@ -190,7 +190,7 @@ static bool mem_avoid_overlap(struct mem_vector *img) } /* Avoid all entries in the setup_data linked list. */ - ptr = (struct setup_data *)(unsigned long)real_mode->hdr.setup_data; + ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; while (ptr) { struct mem_vector avoid; @@ -288,8 +288,8 @@ static unsigned long find_random_addr(unsigned long minimum, minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); /* Verify potential e820 positions, appending to slots list. */ - for (i = 0; i < real_mode->e820_entries; i++) { - process_e820_entry(&real_mode->e820_map[i], minimum, size); + for (i = 0; i < boot_params->e820_entries; i++) { + process_e820_entry(&boot_params->e820_map[i], minimum, size); } return slots_fetch_random(); @@ -315,7 +315,7 @@ unsigned char *choose_kernel_location(unsigned char *input, } #endif - real_mode->hdr.loadflags |= KASLR_FLAG; + boot_params->hdr.loadflags |= KASLR_FLAG; /* Record the various known unsafe memory ranges. 
*/ mem_avoid_init((unsigned long)input, input_size, diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index f35ad9eb1bf1d..462dfbf7467be 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -114,7 +114,7 @@ static void error(char *m); /* * This is set up by the setup-routine at boot-time */ -struct boot_params *real_mode; /* Pointer to real-mode data */ +struct boot_params *boot_params; memptr free_mem_ptr; memptr free_mem_end_ptr; @@ -184,12 +184,12 @@ void __putstr(const char *s) } } - if (real_mode->screen_info.orig_video_mode == 0 && + if (boot_params->screen_info.orig_video_mode == 0 && lines == 0 && cols == 0) return; - x = real_mode->screen_info.orig_x; - y = real_mode->screen_info.orig_y; + x = boot_params->screen_info.orig_x; + y = boot_params->screen_info.orig_y; while ((c = *s++) != '\0') { if (c == '\n') { @@ -210,8 +210,8 @@ void __putstr(const char *s) } } - real_mode->screen_info.orig_x = x; - real_mode->screen_info.orig_y = y; + boot_params->screen_info.orig_x = x; + boot_params->screen_info.orig_y = y; pos = (x + cols * y) * 2; /* Update cursor position */ outb(14, vidport); @@ -392,14 +392,15 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, { unsigned char *output_orig = output; - real_mode = rmode; + /* Retain x86 boot parameters pointer passed from startup_32/64. */ + boot_params = rmode; - /* Clear it for solely in-kernel use */ - real_mode->hdr.loadflags &= ~KASLR_FLAG; + /* Clear flags intended for solely in-kernel use. */ + boot_params->hdr.loadflags &= ~KASLR_FLAG; - sanitize_boot_params(real_mode); + sanitize_boot_params(boot_params); - if (real_mode->screen_info.orig_video_mode == 7) { + if (boot_params->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; vidport = 0x3b4; } else { @@ -407,8 +408,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, vidport = 0x3d4; } - lines = real_mode->screen_info.orig_video_lines; - cols = real_mode->screen_info.orig_video_cols; + lines = boot_params->screen_info.orig_video_lines; + cols = boot_params->screen_info.orig_video_cols; console_init(); debug_putstr("early console in decompress_kernel\n"); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 22346d5a2fa0c..1f750a516580b 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -32,7 +32,7 @@ /* misc.c */ extern memptr free_mem_ptr; extern memptr free_mem_end_ptr; -extern struct boot_params *real_mode; /* Pointer to real-mode data */ +extern struct boot_params *boot_params; void __putstr(const char *s); void __puthex(unsigned long value); #define error_putstr(__x) __putstr(__x) -- GitLab From c04028813221c2d39a4f368586795ac4466d311c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 18 Apr 2016 09:42:13 -0700 Subject: [PATCH 281/705] x86/boot: Clarify purpose of functions in misc.c The function "decompress_kernel" now performs many more duties, so this patch renames it to "extract_kernel" and updates callers and comments. Additionally the file header comment for misc.c is improved to actually describe what is contained. Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. 
Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1460997735-24785-5-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/head_32.S | 8 ++++---- arch/x86/boot/compressed/head_64.S | 4 ++-- arch/x86/boot/compressed/misc.c | 10 ++++++---- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 0256064da8da3..26dd9df19a698 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -233,9 +233,9 @@ relocated: 2: /* - * Do the decompression, and jump to the new kernel.. + * Do the extraction, and jump to the new kernel.. */ - /* push arguments for decompress_kernel: */ + /* push arguments for extract_kernel: */ pushl $z_run_size /* size of kernel with .bss and .brk */ pushl $z_output_len /* decompressed length, end of relocs */ leal z_extract_offset_negative(%ebx), %ebp @@ -246,11 +246,11 @@ relocated: leal boot_heap(%ebx), %eax pushl %eax /* heap area */ pushl %esi /* real mode pointer */ - call decompress_kernel /* returns kernel location in %eax */ + call extract_kernel /* returns kernel location in %eax */ addl $28, %esp /* - * Jump to the decompressed kernel. + * Jump to the extracted kernel. */ xorl %ebx, %ebx jmp *%eax diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 86558a1991393..d43c30ed89ed1 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -408,7 +408,7 @@ relocated: 2: /* - * Do the decompression, and jump to the new kernel.. + * Do the extraction, and jump to the new kernel.. */ pushq %rsi /* Save the real mode argument */ movq $z_run_size, %r9 /* size of kernel with .bss and .brk */ @@ -419,7 +419,7 @@ relocated: movl $z_input_len, %ecx /* input_len */ movq %rbp, %r8 /* output target address */ movq $z_output_len, %r9 /* decompressed length, end of relocs */ - call decompress_kernel /* returns kernel location in %rax */ + call extract_kernel /* returns kernel location in %rax */ popq %r9 popq %rsi diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 462dfbf7467be..0d69e809673ab 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -1,8 +1,10 @@ /* * misc.c * - * This is a collection of several routines from gzip-1.0.3 - * adapted for Linux. + * This is a collection of several routines used to extract the kernel + * which includes KASLR relocation, decompression, ELF parsing, and + * relocation processing. Additionally included are the screen and serial + * output functions and related debugging support functions. 
* * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 * puts by Nick Holloway 1993, better puts by Martin Mares 1995 @@ -383,7 +385,7 @@ static void parse_elf(void *output) free(phdrs); } -asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, +asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, unsigned char *output, @@ -412,7 +414,7 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, cols = boot_params->screen_info.orig_video_cols; console_init(); - debug_putstr("early console in decompress_kernel\n"); + debug_putstr("early console in extract_kernel\n"); free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; -- GitLab From 7de828dfe607013546ece7ce25aa9839e8f93a66 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 18 Apr 2016 09:42:14 -0700 Subject: [PATCH 282/705] x86/KASLR: Clarify purpose of kaslr.c The name "choose_kernel_location" isn't specific enough, and doesn't describe the primary thing it does: choosing a random location. This patch renames it to "choose_random_location", and clarifies what routines are contained in the kaslr.c source file. Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1460997735-24785-6-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/kaslr.c | 13 ++++++++++++- arch/x86/boot/compressed/misc.c | 2 +- arch/x86/boot/compressed/misc.h | 4 ++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index a51ec841c9b93..9e03190d00ad4 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -1,3 +1,14 @@ +/* + * kaslr.c + * + * This contains the routines needed to generate a reasonable level of + * entropy to choose a randomized kernel base address offset in support + * of Kernel Address Space Layout Randomization (KASLR). Additionally + * handles walking the physical memory maps (and tracking memory regions + * to avoid) in order to select a physical memory location that can + * contain the entire properly aligned running kernel image. + * + */ #include "misc.h" #include @@ -295,7 +306,7 @@ static unsigned long find_random_addr(unsigned long minimum, return slots_fetch_random(); } -unsigned char *choose_kernel_location(unsigned char *input, +unsigned char *choose_random_location(unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 0d69e809673ab..ad8c01ac28854 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -431,7 +431,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, * the entire decompressed kernel plus relocation table, or the * entire decompressed kernel plus .bss and .brk sections. */ - output = choose_kernel_location(input_data, input_len, output, + output = choose_random_location(input_data, input_len, output, output_len > run_size ?
output_len : run_size); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 1f750a516580b..9887e0d4aaeb9 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -67,7 +67,7 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE /* kaslr.c */ -unsigned char *choose_kernel_location(unsigned char *input, +unsigned char *choose_random_location(unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size); @@ -75,7 +75,7 @@ unsigned char *choose_kernel_location(unsigned char *input, bool has_cpuflag(int flag); #else static inline -unsigned char *choose_kernel_location(unsigned char *input, +unsigned char *choose_random_location(unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size) -- GitLab From 9016875df408fc5db6a94a3c5f5f5503c916cf81 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 18 Apr 2016 09:42:15 -0700 Subject: [PATCH 283/705] x86/KASLR: Rename "random" to "random_addr" The variable "random" is also the name of a libc function. It's better coding style to avoid overloading such things, so rename it to the more accurate "random_addr". Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1460997735-24785-7-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/kaslr.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 9e03190d00ad4..9c29e7885ef09 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -312,7 +312,7 @@ unsigned char *choose_random_location(unsigned char *input, unsigned long output_size) { unsigned long choice = (unsigned long)output; - unsigned long random; + unsigned long random_addr; #ifdef CONFIG_HIBERNATION if (!cmdline_find_option_bool("kaslr")) { @@ -333,17 +333,17 @@ unsigned char *choose_random_location(unsigned char *input, (unsigned long)output, output_size); /* Walk e820 and find a random address. */ - random = find_random_addr(choice, output_size); - if (!random) { + random_addr = find_random_addr(choice, output_size); + if (!random_addr) { debug_putstr("KASLR could not find suitable E820 region...\n"); goto out; } /* Always enforce the minimum. */ - if (random < choice) + if (random_addr < choice) goto out; - choice = random; + choice = random_addr; out: return (unsigned char *)choice; } -- GitLab From 7a09b225f31031f8cac9e7801b6004e79f8b0da1 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 19 Apr 2016 11:21:15 +0300 Subject: [PATCH 284/705] x86/build/defconfig/64: Enable CONFIG_E1000E=y Very common ethernet. 
Already enabled in i386_defconfig Signed-off-by: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/146105407523.18740.6392078851674393377.stgit@zurg Signed-off-by: Ingo Molnar --- arch/x86/configs/x86_64_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 4f404a64681b8..0c8d7963483ce 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -173,6 +173,7 @@ CONFIG_TIGON3=y CONFIG_NET_TULIP=y CONFIG_E100=y CONFIG_E1000=y +CONFIG_E1000E=y CONFIG_SKY2=y CONFIG_FORCEDETH=y CONFIG_8139TOO=y -- GitLab From abfb9498ee1327f534df92a7ecaea81a85913bae Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 18 Apr 2016 16:43:43 +0300 Subject: [PATCH 285/705] x86/entry: Rename is_{ia32,x32}_task() to in_{ia32,x32}_syscall() The is_ia32_task()/is_x32_task() function names are a big misnomer: they suggest that the compat-ness of a system call is a task property, which is not true: the compat-ness of a system call purely depends on how it was invoked through the system call layer. A task may call 32-bit and 64-bit and x32 system calls without changing any of its kernel visible state. This specific misnomer is also actively dangerous, as it might cause kernel developers to use the wrong kind of security checks within system calls. So rename it to in_{ia32,x32}_syscall(). Suggested-by: Andy Lutomirski Suggested-by: Ingo Molnar Signed-off-by: Dmitry Safonov [ Expanded the changelog. ] Acked-by: Andy Lutomirski Cc: 0x7f454c46@gmail.com Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1460987025-30360-1-git-send-email-dsafonov@virtuozzo.com Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 2 +- arch/x86/include/asm/compat.h | 4 ++-- arch/x86/include/asm/thread_info.h | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/ptrace.c | 2 +- arch/x86/kernel/signal.c | 2 +- arch/x86/kernel/uprobes.c | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index e79d93d44ecd9..ec138e538c44f 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -191,7 +191,7 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, long syscall_trace_enter(struct pt_regs *regs) { - u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; + u32 arch = in_ia32_syscall() ?
AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); if (phase1_result == 0) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index ebb102e1bbc7a..5a3b2c119ed0e 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -307,7 +307,7 @@ static inline void __user *arch_compat_alloc_user_space(long len) return (void __user *)round_down(sp - len, 16); } -static inline bool is_x32_task(void) +static inline bool in_x32_syscall(void) { #ifdef CONFIG_X86_X32_ABI if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT) @@ -318,7 +318,7 @@ static inline bool is_x32_task(void) static inline bool in_compat_syscall(void) { - return is_ia32_task() || is_x32_task(); + return in_ia32_syscall() || in_x32_syscall(); } #define in_compat_syscall in_compat_syscall /* override the generic impl */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ffae84df8a931..30c133ac05cd8 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -255,7 +255,7 @@ static inline bool test_and_clear_restore_sigmask(void) return true; } -static inline bool is_ia32_task(void) +static inline bool in_ia32_syscall(void) { #ifdef CONFIG_X86_32 return true; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 50337eac1ca2d..24d1b7fb4399e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -210,7 +210,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, */ if (clone_flags & CLONE_SETTLS) { #ifdef CONFIG_IA32_EMULATION - if (is_ia32_task()) + if (in_ia32_syscall()) err = do_set_thread_area(p, -1, (struct user_desc __user *)tls, 0); else diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 32e9d9cbb884a..0f4d2a5df2dc2 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1266,7 +1266,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { #ifdef CONFIG_X86_X32_ABI - if (!is_ia32_task()) + if (!in_ia32_syscall()) return x32_arch_ptrace(child, request, caddr, cdata); #endif #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 6408c09bbcd4a..2ebcc60f0e140 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -762,7 +762,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { #ifdef CONFIG_X86_64 - if (is_ia32_task()) + if (in_ia32_syscall()) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index bf4db6eaec8fd..98b4dc87628b1 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -516,7 +516,7 @@ struct uprobe_xol_ops { static inline int sizeof_long(void) { - return is_ia32_task() ? 4 : 8; + return in_ia32_syscall() ? 
4 : 8; } static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) -- GitLab From f454bfddf6ba557381d8bf5df50eff778602ff23 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 14 Apr 2016 14:59:49 +0300 Subject: [PATCH 286/705] perf/core, sched: Don't use clock function pointer to determine clock Now that local_clock() is explicitly inlined in sched.h, taking its pointer would uninline it in the compilation unit where it's done, making (among other things) comparing pointers to this function produce different results in different compilation units. Case in point, x86 perf core's user page updating function compares event's clock against &local_clock to see if it needs to set zero time offset related bits in the page. This patch fixes the latter by looking at the "use_clockid" event attribute instead, to determine whether local clock is used. Fixing the uninlined local_clock() in perf core is left as an exercise for the author of the prior work. Signed-off-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Daniel Lezcano Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Cc: vince@deater.net Fixes: http://lkml.kernel.org/r/1459541050-13654-1-git-send-email-daniel.lezcano@linaro.org Link: http://lkml.kernel.org/r/1460635189-2320-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 041e442a3e280..dd39fde66b54b 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2177,7 +2177,7 @@ void arch_perf_update_userpage(struct perf_event *event, * cap_user_time_zero doesn't make sense when we're using a different * time base for the records. */ - if (event->clock == &local_clock) { + if (!event->attr.use_clockid) { userpg->cap_user_time_zero = 1; userpg->time_zero = data->cyc2ns_offset; } -- GitLab From d6632dd59b66c89724ef28e2723586d1429382aa Mon Sep 17 00:00:00 2001 From: Chris Phlipot Date: Tue, 19 Apr 2016 01:56:02 -0700 Subject: [PATCH 287/705] perf script: Fix postgresql ubuntu install instructions The current instructions for setting up an Ubuntu system for using the export-to-postgresql.py script are incorrect. The instructions in the script have been updated to work on newer versions of ubuntu. -Add missing dependencies to apt-get command: python-pyside.qtsql, libqt4-sql-psql -Add '-s' option to createuser command to force the user to be a superuser since the command doesn't prompt as indicated in the current instructions. Tested on: Ubuntu 14.04, Ubuntu 16.04(beta) Signed-off-by: Chris Phlipot Cc: Adrian Hunter Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1461056164-14914-3-git-send-email-cphlipot0@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/export-to-postgresql.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/perf/scripts/python/export-to-postgresql.py b/tools/perf/scripts/python/export-to-postgresql.py index 1b02cdc0cab69..6f0ca6873c17a 100644 --- a/tools/perf/scripts/python/export-to-postgresql.py +++ b/tools/perf/scripts/python/export-to-postgresql.py @@ -34,10 +34,9 @@ import datetime # # ubuntu: # -# $ sudo apt-get install postgresql +# $ sudo apt-get install postgresql python-pyside.qtsql libqt4-sql-psql # $ sudo su - postgres -# $ createuser -# Shall the new role be a superuser? 
(y/n) y +# $ createuser -s # # An example of using this script with Intel PT: # -- GitLab From f56ebf20d0f535f5da7cfcf0000ab3e0af133f81 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 19 Apr 2016 00:07:18 +0100 Subject: [PATCH 288/705] perf jit: memset() variable 'st' using the correct size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current code is memsetting the 'struct stat' variable 'st' with the size of 'stat' (which turns out to be 1 byte) rather than the size of variable 'st'. Committer notes: sizeof(function) isn't valid, the result depends on the compiler used, with gcc, enabling pedantic warnings we get: $ cat sizeof_function.c #include <sys/types.h> #include <sys/stat.h> #include <unistd.h> #include <stdio.h> int main(void) { printf("sizeof(stat)=%zd, stat=%p\n", sizeof(stat), stat); return 0; } $ readelf -sW sizeof_function | grep -w stat 49: 0000000000400630 16 FUNC WEAK HIDDEN 13 stat $ cc -pedantic sizeof_function.c -o sizeof_function sizeof_function.c: In function ‘main’: sizeof_function.c:8:46: warning: invalid application of ‘sizeof’ to a function type [-Wpointer-arith] printf("sizeof(stat)=%zd, stat=%p\n", sizeof(stat), stat); ^ $ ./sizeof_function sizeof(stat)=1, stat=0x400630 $ Standard C, section 6.5.3.4: "The sizeof operator shall not be applied to an expression that has function type or an incomplete type, to the parenthesized name of such a type, or to an expression that designates a bit-field member." http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1256.pdf Signed-off-by: Colin Ian King Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Peter Zijlstra Cc: Stephane Eranian Fixes: 9b07e27f88b9 ("perf inject: Add jitdump mmap injection support") Link: http://lkml.kernel.org/r/1461020838-9260-1-git-send-email-colin.king@canonical.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/jitdump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index 52fcef3074fee..86afe9618bb0d 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -412,7 +412,7 @@ static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr) return -1; } if (stat(filename, &st)) - memset(&st, 0, sizeof(stat)); + memset(&st, 0, sizeof(st)); event->mmap2.header.type = PERF_RECORD_MMAP2; event->mmap2.header.misc = PERF_RECORD_MISC_USER; @@ -500,7 +500,7 @@ static int jit_repipe_code_move(struct jit_buf_desc *jd, union jr_entry *jr) size++; /* for \0 */ if (stat(filename, &st)) - memset(&st, 0, sizeof(stat)); + memset(&st, 0, sizeof(st)); size = PERF_ALIGN(size, sizeof(u64)); -- GitLab From 2cc4666927402ec748122cac15ceac35a5e298a3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 19 Apr 2016 12:01:51 -0300 Subject: [PATCH 289/705] perf build: Remove x86 references from arch-neutral Build It will already be dealt with generating the syscalltbl.c file in the x86 arch specific Build files, namely via 'archheaders'. This fixes the build on !x86 arches, as reported for powerpcle. Reported-by: Stephen Rothwell Tested-by: Jiri Olsa Cc: Adrian Hunter Cc: David Ahern Cc: "H.
Peter Anvin" Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wang Nan Fixes: 1b700c997500 ("perf tools: Build syscall table .c header from kernel's syscall_64.tbl") Link: http://lkml.kernel.org/r/20160415212831.GT9056@kernel.org [ Removed the syscalltbl.o altogether, as per Jiri's suggestion ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 85a9ab62e23fe..90229a88f969b 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -150,10 +150,6 @@ CFLAGS_libstring.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ET CFLAGS_hweight.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_parse-events.o += -Wno-redundant-decls -$(OUTPUT)util/syscalltbl.o: util/syscalltbl.c arch/x86/entry/syscalls/syscall_64.tbl $(OUTPUT)arch/x86/include/generated/asm/syscalls_64.c FORCE - $(call rule_mkdir) - $(call if_changed_dep,cc_o_c) - $(OUTPUT)util/kallsyms.o: ../lib/symbol/kallsyms.c FORCE $(call rule_mkdir) $(call if_changed_dep,cc_o_c) -- GitLab From e02092b9a922f17e951b2df5f12f4aafe7383a21 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 19 Apr 2016 12:12:49 -0300 Subject: [PATCH 290/705] perf symbols: Allow loading kallsyms without considering kcore files Before the support for using /proc/kcore was introduced, the kallsyms routines used /proc/modules and the first 'perf test' entry expected finding maps for each module in the system, which is not the case with the kcore code. Provide a way to ignore kcore files so that the test can have its expectations met. Improving the test to cover kcore files as well needs to be done. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-ek5urnu103dlhfk4l6pcw041@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 12 +++++++++--- tools/perf/util/machine.h | 2 ++ tools/perf/util/symbol.c | 12 +++++++++--- tools/perf/util/symbol.h | 2 ++ 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 52b51e004fe8f..656c1d7ee7d46 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -908,11 +908,11 @@ int machines__create_kernel_maps(struct machines *machines, pid_t pid) return machine__create_kernel_maps(machine); } -int machine__load_kallsyms(struct machine *machine, const char *filename, - enum map_type type, symbol_filter_t filter) +int __machine__load_kallsyms(struct machine *machine, const char *filename, + enum map_type type, bool no_kcore, symbol_filter_t filter) { struct map *map = machine__kernel_map(machine); - int ret = dso__load_kallsyms(map->dso, filename, map, filter); + int ret = __dso__load_kallsyms(map->dso, filename, map, no_kcore, filter); if (ret > 0) { dso__set_loaded(map->dso, type); @@ -927,6 +927,12 @@ int machine__load_kallsyms(struct machine *machine, const char *filename, return ret; } +int machine__load_kallsyms(struct machine *machine, const char *filename, + enum map_type type, symbol_filter_t filter) +{ + return __machine__load_kallsyms(machine, filename, type, false, filter); +} + int machine__load_vmlinux_path(struct machine *machine, enum map_type type, symbol_filter_t filter) { diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index 382873bdc5635..4822de5e4544d 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -215,6 +215,8 @@ 
struct symbol *machine__find_kernel_function_by_name(struct machine *machine, struct map *machine__findnew_module_map(struct machine *machine, u64 start, const char *filename); +int __machine__load_kallsyms(struct machine *machine, const char *filename, + enum map_type type, bool no_kcore, symbol_filter_t filter); int machine__load_kallsyms(struct machine *machine, const char *filename, enum map_type type, symbol_filter_t filter); int machine__load_vmlinux_path(struct machine *machine, enum map_type type, diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index a36823c3b7c0d..415c4f6d98fd4 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1208,8 +1208,8 @@ static int kallsyms__delta(struct map *map, const char *filename, u64 *delta) return 0; } -int dso__load_kallsyms(struct dso *dso, const char *filename, - struct map *map, symbol_filter_t filter) +int __dso__load_kallsyms(struct dso *dso, const char *filename, + struct map *map, bool no_kcore, symbol_filter_t filter) { u64 delta = 0; @@ -1230,12 +1230,18 @@ int dso__load_kallsyms(struct dso *dso, const char *filename, else dso->symtab_type = DSO_BINARY_TYPE__KALLSYMS; - if (!dso__load_kcore(dso, map, filename)) + if (!no_kcore && !dso__load_kcore(dso, map, filename)) return dso__split_kallsyms_for_kcore(dso, map, filter); else return dso__split_kallsyms(dso, map, delta, filter); } +int dso__load_kallsyms(struct dso *dso, const char *filename, + struct map *map, symbol_filter_t filter) +{ + return __dso__load_kallsyms(dso, filename, map, false, filter); +} + static int dso__load_perf_map(struct dso *dso, struct map *map, symbol_filter_t filter) { diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 1da7b101bc7f8..c8e43979ed5c4 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -240,6 +240,8 @@ int dso__load_vmlinux(struct dso *dso, struct map *map, symbol_filter_t filter); int dso__load_vmlinux_path(struct dso *dso, struct map *map, symbol_filter_t filter); +int __dso__load_kallsyms(struct dso *dso, const char *filename, struct map *map, + bool no_kcore, symbol_filter_t filter); int dso__load_kallsyms(struct dso *dso, const char *filename, struct map *map, symbol_filter_t filter); -- GitLab From 53d0fe68275dbdaf6a532bb4e87f00db5d36c140 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 19 Apr 2016 12:16:55 -0300 Subject: [PATCH 291/705] perf test: Ignore kcore files in the "vmlinux matches kallsyms" test Before: # perf test -v kallsyms Maps only in vmlinux: ffffffff81d5e000-ffffffff81ec3ac8 115e000 [kernel].init.text ffffffff81ec3ac8-ffffffffa0000000 12c3ac8 [kernel].exit.text ffffffffa0000000-ffffffffa000c000 0 [fjes] ffffffffa000c000-ffffffffa0017000 0 [video] ffffffffa0017000-ffffffffa001c000 0 [grace] ffffffffa0a7f000-ffffffffa0ba5000 0 [xfs] ffffffffa0ba5000-ffffffffffffffff 0 [veth] Maps in vmlinux with a different name in kallsyms: Maps only in kallsyms: ffff880000100000-ffff88001000b000 80000103000 [kernel.kallsyms] ffff88001000b000-ffff880100000000 8001000e000 [kernel.kallsyms] ffff880100000000-ffffc90000000000 80100003000 [kernel.kallsyms] ffffffffa0000000-ffffffffff600000 7fffa0003000 [kernel.kallsyms] ffffffffff600000-ffffffffffffffff 7fffff603000 [kernel.kallsyms] test child finished with -1 ---- end ---- vmlinux symtab matches kallsyms: FAILED! 
# After: # perf test -v 1 1: vmlinux symtab matches kallsyms : --- start --- test child forked, pid 7058 Looking at the vmlinux_path (8 entries long) Using /lib/modules/4.6.0-rc1+/build/vmlinux for symbols 0xffffffff81076870: diff end addr for aesni_gcm_dec v: 0xffffffff810791f2 k: 0xffffffff81076902 0xffffffff81079200: diff end addr for aesni_gcm_enc v: 0xffffffff8107bb03 k: 0xffffffff81079292 0xffffffff8107e8d0: diff end addr for aesni_gcm_enc_avx_gen2 v: 0xffffffff81083e76 k: 0xffffffff8107e943 0xffffffff81083e80: diff end addr for aesni_gcm_dec_avx_gen2 v: 0xffffffff81089611 k: 0xffffffff81083ef3 0xffffffff81089990: diff end addr for aesni_gcm_enc_avx_gen4 v: 0xffffffff8108e7c4 k: 0xffffffff81089a03 0xffffffff8108e7d0: diff end addr for aesni_gcm_dec_avx_gen4 v: 0xffffffff810937ef k: 0xffffffff8108e843 Maps only in vmlinux: ffffffff81d5e000-ffffffff81ec3ac8 115e000 [kernel].init.text ffffffff81ec3ac8-ffffffffa0000000 12c3ac8 [kernel].exit.text Maps in vmlinux with a different name in kallsyms: Maps only in kallsyms: test child finished with -1 ---- end ---- vmlinux symtab matches kallsyms: FAILED! # Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Fixes: 8e0cf965f95e ("perf symbols: Add support for reading from /proc/kcore") Link: http://lkml.kernel.org/n/tip-n6vrwt9t89w8k769y349govx@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/vmlinux-kallsyms.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index 630b0b409b973..c05f1bdd92103 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ b/tools/perf/tests/vmlinux-kallsyms.c @@ -54,8 +54,14 @@ int test__vmlinux_matches_kallsyms(int subtest __maybe_unused) * Step 3: * * Load and split /proc/kallsyms into multiple maps, one per module. + * Do not use kcore, as this test was designed before kcore support + * and has parts that only make sense if using the non-kcore code. + * XXX: extend it to stress the kcorre code as well, hint: the list + * of modules extracted from /proc/kcore, in its current form, can't + * be compacted against the list of modules found in the "vmlinux" + * code and with the one got from /proc/modules from the "kallsyms" code. */ - if (machine__load_kallsyms(&kallsyms, "/proc/kallsyms", type, NULL) <= 0) { + if (__machine__load_kallsyms(&kallsyms, "/proc/kallsyms", type, true, NULL) <= 0) { pr_debug("dso__load_kallsyms "); goto out; } -- GitLab From 6566feafb4dba4eef30a9c0b25e6f49f996178b6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 19 Apr 2016 12:22:25 -0300 Subject: [PATCH 292/705] perf test: Add missing verbose output explaining the reason for failure One of the branches leading to an error had no debug message emitted, fix it, the new lines are: # perf test -v kallsyms 0xffffffff81001000: diff name v: xen_hypercall_set_trap_table k: hypercall_page 0xffffffff810691f0: diff name v: try_to_free_pud_page k: try_to_free_pmd_page 0xffffffff8150bb20: diff name v: wakeup_expire_count_show.part.5 k: wakeup_active_count_show.part.7 0xffffffff816bc7f0: diff name v: phys_switch_id_show.part.11 k: phys_port_name_show.part.12 0xffffffff817bbb90: diff name v: __do_softirq k: __softirqentry_text_start This in turn exercises another bug, still under investigation, because those aliases _are_ in kallsyms, with the same name... 
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Fixes: ab414dcda8fa ("perf test: Fixup aliases checking in the 'vmlinux matches kallsyms' test") Link: http://lkml.kernel.org/n/tip-5fhea7a54a54gsmagu9obpr4@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/vmlinux-kallsyms.c | 3 +++ 1 file changed, 3 insertions(+)

diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index c05f1bdd92103..e63abab7d5a17 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ b/tools/perf/tests/vmlinux-kallsyms.c @@ -163,6 +163,9 @@ int test__vmlinux_matches_kallsyms(int subtest __maybe_unused) pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n", mem_start, sym->name, pair->name); + } else { + pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n", + mem_start, sym->name, first_pair->name); } } } else

-- GitLab From 5c1458478c49b905652fc002708d09369763f58f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jan 2016 16:49:24 -0800 Subject: [PATCH 293/705] documentation: Add documentation for RCU's major data structures

This commit adds documentation for RCU's major data structures, including rcu_state, rcu_node, rcu_data, rcu_dynticks, and rcu_head.

Signed-off-by: Paul E. McKenney --- .../Data-Structures/BigTreeClassicRCU.svg | 474 ++++++ .../Data-Structures/BigTreeClassicRCUBH.svg | 499 ++++++ .../BigTreeClassicRCUBHdyntick.svg | 695 +++++++++ .../BigTreePreemptRCUBHdyntick.svg | 741 +++++++++ .../BigTreePreemptRCUBHdyntickCB.svg | 858 +++++++++++ .../Data-Structures/Data-Structures.html | 1333 +++++++++++++++++ .../Data-Structures/HugeTreeClassicRCU.svg | 939 ++++++++++++ .../RCU/Design/Data-Structures/TreeLevel.svg | 828 ++++++++++ .../Design/Data-Structures/TreeMapping.svg | 305 ++++ .../Data-Structures/TreeMappingLevel.svg | 380 +++++ .../RCU/Design/Data-Structures/blkd_task.svg | 843 +++++++++++ .../RCU/Design/Data-Structures/nxtlist.svg | 396 +++++ 12 files changed, 8291 insertions(+) create mode 100644 Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg create mode 100644 Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg create mode 100644 Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg create mode 100644 Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg create mode 100644 Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg create mode 100644 Documentation/RCU/Design/Data-Structures/Data-Structures.html create mode 100644 Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg create mode 100644 Documentation/RCU/Design/Data-Structures/TreeLevel.svg create mode 100644 Documentation/RCU/Design/Data-Structures/TreeMapping.svg create mode 100644 Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg create mode 100644 Documentation/RCU/Design/Data-Structures/blkd_task.svg create mode 100644 Documentation/RCU/Design/Data-Structures/nxtlist.svg

diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg new file mode 100644 index 0000000000000..727e270b11e4e --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg @@ -0,0 +1,474 @@
[SVG figure: an enclosing rcu_state structure containing a tree of rcu_node structures, with per-CPU rcu_data structures (labeled CPU 0, CPU 15, CPU 1007, CPU 1023) attached to the leaves. The 474 lines of SVG markup survive in this dump only as stray text labels and are omitted here.]

diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg new file mode 100644 index 0000000000000..9bbb1944f962d --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg @@ -0,0 +1,499 @@
[SVG figure: the same rcu_state/rcu_node/rcu_data combining tree, duplicated for the rcu_sched and rcu_bh flavors; markup omitted.]

diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg new file mode 100644 index 0000000000000..21ba7823479d4 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg @@ -0,0 +1,695 @@
[SVG figure: the rcu_sched and rcu_bh trees plus per-CPU rcu_dynticks structures shared by both flavors' rcu_data structures; markup omitted.]

diff --git a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg new file mode 100644 index 0000000000000..15adcac036c73 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg @@ -0,0 +1,741 @@
[SVG figure: as above, with the rcu_preempt flavor added alongside rcu_sched and rcu_bh; markup omitted.]

diff --git a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg new file mode 100644 index 0000000000000..bbc3801470d09 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg @@ -0,0 +1,858 @@
[SVG figure: as above, with queues of rcu_head callback structures attached to the rcu_data structures; markup omitted.]

diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html new file mode 100644 index 0000000000000..7eb47ac25ad77 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -0,0 +1,1333 @@ + + + A Tour Through TREE_RCU's Data Structures [LWN.net] + + +

    January 27, 2016

    +

    This article was contributed by Paul E. McKenney

    + +

    Introduction

    + +This document describes RCU's major data structures and their relationship +to each other. + +
      +
    1. + Data-Structure Relationships +
    2. + The rcu_state Structure +
    3. + The rcu_node Structure +
    4. + The rcu_data Structure +
    5. + The rcu_dynticks Structure +
    6. + The rcu_head Structure +
    7. + RCU-Specific Fields in the task_struct Structure +
    8. + Accessor Functions +
    + +At the end we have the +answers to the quick quizzes. + +

    Data-Structure Relationships

    + +

    RCU is for all intents and purposes a large state machine, and its +data structures maintain the state in such a way as to allow RCU readers +to execute extremely quickly, while also processing the RCU grace periods +requested by updaters in an efficient and extremely scalable fashion. +The efficiency and scalability of RCU updaters is provided primarily +by a combining tree, as shown below: + +

    BigTreeClassicRCU.svg + +

This diagram shows an enclosing rcu_state structure +containing a tree of rcu_node structures. +Each leaf node of the rcu_node tree has up to 16 +rcu_data structures associated with it, so that there +are NR_CPUS number of rcu_data structures, +one for each possible CPU. +This structure is adjusted at boot time, if needed, to handle the +common case where nr_cpu_ids is much less than +NR_CPUS. +For example, a number of Linux distributions set NR_CPUS=4096, +which results in a three-level rcu_node tree. +If the actual hardware has only 16 CPUs, RCU will adjust itself +at boot time, resulting in an rcu_node tree with only a single node.
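As a rough mental model of these relationships, the following C sketch may help. It is loosely modeled on the real definitions in kernel/rcu/tree.h, but it shows only a deliberately minimal subset of the fields, and the fanout values are illustrative:

    /*
     * Minimal sketch only: the real structures carry many more fields
     * and are declared in kernel/rcu/tree.h.
     */
    struct rcu_node {
            struct rcu_node *parent;     /* NULL for the root rcu_node. */
            unsigned long qsmask;        /* CPUs/groups yet to report a QS. */
            unsigned long grpmask;       /* This node's bit in parent's qsmask. */
    };

    struct rcu_data {                    /* One instance per possible CPU. */
            struct rcu_node *mynode;     /* Leaf rcu_node this CPU reports to. */
    };

    struct rcu_state {
            struct rcu_node node[1 + 64];   /* Tree laid out root first:
                                               one root plus 64 leaves here. */
            struct rcu_data __percpu *rda;  /* Per-CPU rcu_data instances. */
    };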

    The purpose of this combining tree is to allow per-CPU events +such as quiescent states, dyntick-idle transitions, +and CPU hotplug operations to be processed efficiently +and scalably. +Quiescent states are recorded by the per-CPU rcu_data structures, +and other events are recorded by the leaf-level rcu_node +structures. +All of these events are combined at each level of the tree until finally +grace periods are completed at the tree's root rcu_node +structure. +A grace period can be completed at the root once every CPU +(or, in the case of CONFIG_PREEMPT_RCU, task) +has passed through a quiescent state. +Once a grace period has completed, record of that fact is propagated +back down the tree. + +

    As can be seen from the diagram, on a 64-bit system +a two-level tree with 64 leaves can accommodate 1,024 CPUs, with a fanout +of 64 at the root and a fanout of 16 at the leaves. + + + + + + + + +
     
    Quick Quiz:
    + Why isn't the fanout at the leaves also 64? +
    Answer:
+ Because there are more types of events that affect the leaf-level + rcu_node structures than further up the tree. + Therefore, if the leaf rcu_node structures have fanout of + 64, the contention on these structures' ->lock + fields becomes excessive. + Experimentation on a wide variety of systems has shown that a fanout + of 16 works well for the leaves of the rcu_node tree. + +

    Of course, further experience with + systems having hundreds or thousands of CPUs may demonstrate + that the fanout for the non-leaf rcu_node structures + must also be reduced. + Such reduction can be easily carried out when and if it proves + necessary. + In the meantime, if you are using such a system and running into + contention problems on the non-leaf rcu_node structures, + you may use the CONFIG_RCU_FANOUT kernel configuration + parameter to reduce the non-leaf fanout as needed. + + +

    Kernels built for systems with + strong NUMA characteristics might also need to adjust + CONFIG_RCU_FANOUT so that the domains of the + rcu_node structures align with hardware boundaries. + However, there has thus far been no need for this. +

     
    + +

    If your system has more than 1,024 CPUs (or more than 512 CPUs on +a 32-bit system), then RCU will automatically add more levels to the +tree. +For example, if you are crazy enough to build a 64-bit system with 65,536 +CPUs, RCU would configure the rcu_node tree as follows: + +

    HugeTreeClassicRCU.svg + +

    RCU currently permits up to a four-level tree, which on a 64-bit system +accommodates up to 4,194,304 CPUs, though only a mere 524,288 CPUs for +32-bit systems. +On the other hand, you can set CONFIG_RCU_FANOUT to be +as small as 2 if you wish, which would permit only 16 CPUs, which +is useful for testing. + +
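Checking these limits is simple arithmetic: each non-leaf level multiplies the capacity by the non-leaf fanout (64 on 64-bit systems and 32 on 32-bit systems, as implied by the numbers above), and the leaf level multiplies by 16:

    Two levels,   64-bit:  64 * 16           =     1,024 CPUs
    Three levels, 64-bit:  64 * 64 * 16      =    65,536 CPUs
    Four levels,  64-bit:  64 * 64 * 64 * 16 = 4,194,304 CPUs
    Four levels,  32-bit:  32 * 32 * 32 * 16 =   524,288 CPUs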

    This multi-level combining tree allows us to get most of the +performance and scalability +benefits of partitioning, even though RCU grace-period detection is +inherently a global operation. +The trick here is that only the last CPU to report a quiescent state +into a given rcu_node structure need advance to the rcu_node +structure at the next level up the tree. +This means that at the leaf-level rcu_node structure, only +one access out of sixteen will progress up the tree. +For the internal rcu_node structures, the situation is even +more extreme: Only one access out of sixty-four will progress up +the tree. +Because the vast majority of the CPUs do not progress up the tree, +the lock contention remains roughly constant up the tree. +No matter how many CPUs there are in the system, at most 64 quiescent-state +reports per grace period will progress all the way to the root +rcu_node structure, thus ensuring that the lock contention +on that root rcu_node structure remains acceptably low. + +

    In effect, the combining tree acts like a big shock absorber, +keeping lock contention under control at all tree levels regardless +of the level of loading on the system. + +
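To make the last-CPU-to-report rule concrete, here is a heavily simplified sketch of a quiescent-state report walking up the tree. The real code (rcu_report_qs_rnp() and friends) must additionally handle locking, grace-period numbers, and preempted readers; none of that appears here:

    /* Sketch only: no locking, no grace-period bookkeeping. */
    static void report_qs(struct rcu_node *rnp, unsigned long mask)
    {
            for (; rnp != NULL; rnp = rnp->parent) {
                    rnp->qsmask &= ~mask;   /* This CPU/group has now reported. */
                    if (rnp->qsmask != 0)
                            return;         /* Others still pending: stop here. */
                    mask = rnp->grpmask;    /* Last reporter: go up one level. */
            }
            /* Root's qsmask is now zero: the grace period can end. */
    }

Note that the early return at a still-nonzero qsmask is what keeps most reports from ever touching the upper levels, and hence what keeps contention on the root rcu_node structure low.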

    The Linux kernel actually supports multiple flavors of RCU +running concurrently, so RCU builds separate data structures for each +flavor. +For example, for CONFIG_TREE_RCU=y kernels, RCU provides +rcu_sched and rcu_bh, as shown below: + +

    BigTreeClassicRCUBH.svg + +

    Energy efficiency is increasingly important, and for that +reason the Linux kernel provides CONFIG_NO_HZ_IDLE, which +turns off the scheduling-clock interrupts on idle CPUs, which in +turn allows those CPUs to attain deeper sleep states and to consume +less energy. +CPUs whose scheduling-clock interrupts have been turned off are +said to be in dyntick-idle mode. +RCU must handle dyntick-idle CPUs specially +because RCU would otherwise wake up each CPU on every grace period, +which would defeat the whole purpose of CONFIG_NO_HZ_IDLE. +RCU uses the rcu_dynticks structure to track +which CPUs are in dyntick idle mode, as shown below: + +

    BigTreeClassicRCUBHdyntick.svg + +

    However, if a CPU is in dyntick-idle mode, it is in that mode +for all flavors of RCU. +Therefore, a single rcu_dynticks structure is allocated per +CPU, and all of a given CPU's rcu_data structures share +that rcu_dynticks, as shown in the figure. + +
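In sketch form, this sharing simply means that each flavor's per-CPU rcu_data carries a pointer to the single rcu_dynticks structure for that CPU (field names abbreviated from the real definitions, and shown here only for illustration):

    struct rcu_dynticks {
            long dynticks_nesting;  /* Process-level dyntick-idle nesting depth. */
            atomic_t dynticks;      /* Even while in dyntick-idle mode,
                                       odd otherwise. */
    };

    struct rcu_data {
            /* ... per-flavor, per-CPU state ... */
            struct rcu_dynticks *dynticks;  /* Shared by all flavors on this CPU. */
    };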

    Kernels built with CONFIG_PREEMPT_RCU support +rcu_preempt in addition to rcu_sched and rcu_bh, as shown below: + +

    BigTreePreemptRCUBHdyntick.svg + +

RCU updaters wait for normal grace periods by registering +RCU callbacks, either directly via call_rcu() and +friends (namely call_rcu_bh() and call_rcu_sched(), +there being a separate interface per flavor of RCU) +or indirectly via synchronize_rcu() and friends. +RCU callbacks are represented by rcu_head structures, +which are queued on rcu_data structures while they are +waiting for a grace period to elapse, as shown in the following figure:

    BigTreePreemptRCUBHdyntickCB.svg + +

    This figure shows how TREE_RCU's and +PREEMPT_RCU's major data structures are related. +Lesser data structures will be introduced with the algorithms that +make use of them. + +
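From the updater's side, the rcu_head is normally the only one of these structures touched directly. A typical (illustrative) usage pattern embeds it in the protected structure and hands it to call_rcu(), which queues it on the invoking CPU's rcu_data until a grace period has elapsed:

    struct foo {
            int data;
            struct rcu_head rh;     /* Embedded callback structure. */
    };

    static void foo_reclaim(struct rcu_head *head)
    {
            /* container_of() recovers the enclosing structure. */
            kfree(container_of(head, struct foo, rh));
    }

    static void foo_delete(struct foo *fp)
    {
            /* ... remove fp from the enclosing RCU-protected structure ... */
            call_rcu(&fp->rh, foo_reclaim);  /* Reclaim after a grace period. */
    }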

    Note that each of the data structures in the above figure has +its own synchronization: + +

      +
1. Each rcu_state structure has a lock and a mutex, + and some fields are protected by the corresponding root + rcu_node structure's lock.
    2. Each rcu_node structure has a spinlock. +
    3. The fields in rcu_data are private to the corresponding + CPU, although a few can be read and written by other CPUs. +
    4. Similarly, the fields in rcu_dynticks are private + to the corresponding CPU, although a few can be read by + other CPUs. +
    + +

    It is important to note that different data structures can have +very different ideas about the state of RCU at any given time. +For but one example, awareness of the start or end of a given RCU +grace period propagates slowly through the data structures. +This slow propagation is absolutely necessary for RCU to have good +read-side performance. +If this balkanized implementation seems foreign to you, one useful +trick is to consider each instance of these data structures to be +a different person, each having the usual slightly different +view of reality. + +

    The general role of each of these data structures is as +follows: + +

      +
1. rcu_state: + This structure forms the interconnection between the + rcu_node and rcu_data structures, + tracks grace periods, serves as short-term repository + for callbacks orphaned by CPU-hotplug events, + maintains rcu_barrier() state, + tracks expedited grace-period state, + and maintains state used to force quiescent states when + grace periods extend too long.
    2. rcu_node: This structure forms the combining + tree that propagates quiescent-state + information from the leaves to the root, and also propagates + grace-period information from the root to the leaves. + It provides local copies of the grace-period state in order + to allow this information to be accessed in a synchronized + manner without suffering the scalability limitations that + would otherwise be imposed by global locking. + In CONFIG_PREEMPT_RCU kernels, it manages the lists + of tasks that have blocked while in their current + RCU read-side critical section. + In CONFIG_PREEMPT_RCU with + CONFIG_RCU_BOOST, it manages the + per-rcu_node priority-boosting + kernel threads (kthreads) and state. + Finally, it records CPU-hotplug state in order to determine + which CPUs should be ignored during a given grace period. +
    3. rcu_data: This per-CPU structure is the + focus of quiescent-state detection and RCU callback queuing. + It also tracks its relationship to the corresponding leaf + rcu_node structure to allow more-efficient + propagation of quiescent states up the rcu_node + combining tree. + Like the rcu_node structure, it provides a local + copy of the grace-period information to allow for-free + synchronized + access to this information from the corresponding CPU. + Finally, this structure records past dyntick-idle state + for the corresponding CPU and also tracks statistics. +
    4. rcu_dynticks: + This per-CPU structure tracks the current dyntick-idle + state for the corresponding CPU. + Unlike the other three structures, the rcu_dynticks + structure is not replicated per RCU flavor. +
    5. rcu_head: + This structure represents RCU callbacks, and is the + only structure allocated and managed by RCU users. + The rcu_head structure is normally embedded + within the RCU-protected data structure. +
    + +

    If all you wanted from this article was a general notion of how +RCU's data structures are related, you are done. +Otherwise, each of the following sections give more details on +the rcu_state, rcu_node, rcu_data, +and rcu_dynticks data structures. + +

    +The rcu_state Structure

    + +

    The rcu_state structure is the base structure that +represents a flavor of RCU. +This structure forms the interconnection between the +rcu_node and rcu_data structures, +tracks grace periods, contains the lock used to +synchronize with CPU-hotplug events, +and maintains state used to force quiescent states when +grace periods extend too long. + +

    A few of the rcu_state structure's fields are discussed, +singly and in groups, in the following sections. +The more specialized fields are covered in the discussion of their +use. + +

    Relationship to rcu_node and rcu_data Structures
    + +This portion of the rcu_state structure is declared +as follows: + +
    +  1   struct rcu_node node[NUM_RCU_NODES];
    +  2   struct rcu_node *level[NUM_RCU_LVLS + 1];
    +  3   struct rcu_data __percpu *rda;
    +
    + + + + + + + + +
     
    Quick Quiz:
    + Wait a minute! + You said that the rcu_node structures formed a tree, + but they are declared as a flat array! + What gives? +
    Answer:
    + The tree is laid out in the array. + The first node in the array is the head, the next set of nodes in the + array are children of the head node, and so on until the last set of + nodes in the array are the leaves. + + +

    The following diagrams show how + this works. +

     
    + +

    The rcu_node tree is embedded into the +->node[] array as shown in the following figure: + +

    TreeMapping.svg + +

    One interesting consequence of this mapping is that a +breadth-first traversal of the tree is implemented as a simple +linear scan of the array, which is in fact what the +rcu_for_each_node_breadth_first() macro does. +This macro is used at the beginning and end of grace periods. + +

    Each entry of the ->level array references +the first rcu_node structure on the corresponding level +of the tree, for example, as shown below: + +

    TreeMappingLevel.svg + +

    The zeroth element of the array references the root +rcu_node structure, the first element references the +first child of the root rcu_node, and finally the second +element references the first leaf rcu_node structure. + +

    For whatever it is worth, if you draw the tree to be tree-shaped +rather than array-shaped, it is easy to draw a planar representation: + +

    TreeLevel.svg + +

    Finally, the ->rda field references a per-CPU +pointer to the corresponding CPU's rcu_data structure. + +

    All of these fields are constant once initialization is complete, +and therefore need no protection. + +

    Grace-Period Tracking
    + +

    This portion of the rcu_state structure is declared +as follows: + +

    +  1   unsigned long gpnum;
    +  2   unsigned long completed;
    +
    + +

    RCU grace periods are numbered, and +the ->gpnum field contains the number of the grace +period that started most recently. +The ->completed field contains the number of the +grace period that completed most recently. +If the two fields are equal, the RCU grace period that most recently +started has already completed, and therefore the corresponding +flavor of RCU is idle. +If ->gpnum is one greater than ->completed, +then ->gpnum gives the number of the current RCU +grace period, which has not yet completed. +Any other combination of values indicates that something is broken. +These two fields are protected by the root rcu_node's +->lock field. + +
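+
+This encoding makes the question “is a grace period in progress?”
+cheap to answer.
+The following is a hedged sketch modeled on, but not guaranteed to
+match, the kernel's helper of the same name:
+
+  static int rcu_gp_in_progress(struct rcu_state *rsp)
+  {
+          return READ_ONCE(rsp->completed) != READ_ONCE(rsp->gpnum);
+  }
+
+The READ_ONCE() wrappers reflect the fact that this sketch samples
+the two fields without holding the root rcu_node structure's
+->lock.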

    There are ->gpnum and ->completed fields +in the rcu_node and rcu_data structures +as well. +The fields in the rcu_state structure represent the +most current values, and those of the other structures are compared +in order to detect the start of a new grace period in a distributed +fashion. +The values flow from rcu_state to rcu_node +(down the tree from the root to the leaves) to rcu_data. + +

    Miscellaneous
    + +

    This portion of the rcu_state structure is declared +as follows: + +

    +  1   unsigned long gp_max;
    +  2   char abbr;
    +  3   char *name;
    +
    + +

    The ->gp_max field tracks the duration of the longest +grace period in jiffies. +It is protected by the root rcu_node's ->lock. + +

    The ->name field points to the name of the RCU flavor +(for example, “rcu_sched”), and is constant. +The ->abbr field contains a one-character abbreviation, +for example, “s” for RCU-sched. + +

    +The rcu_node Structure

    + +

    The rcu_node structures form the combining +tree that propagates quiescent-state +information from the leaves to the root and that also propagates +grace-period information from the root down to the leaves. +They provide local copies of the grace-period state in order +to allow this information to be accessed in a synchronized +manner without suffering the scalability limitations that +would otherwise be imposed by global locking. +In CONFIG_PREEMPT_RCU kernels, they manage the lists +of tasks that have blocked while in their current +RCU read-side critical section. +In CONFIG_PREEMPT_RCU with +CONFIG_RCU_BOOST, they manage the +per-rcu_node priority-boosting +kernel threads (kthreads) and state. +Finally, they record CPU-hotplug state in order to determine +which CPUs should be ignored during a given grace period. + +

    The rcu_node structure's fields are discussed, +singly and in groups, in the following sections. + +

    Connection to Combining Tree
    + +

    This portion of the rcu_node structure is declared +as follows: + +

    +  1   struct rcu_node *parent;
    +  2   u8 level;
    +  3   u8 grpnum;
    +  4   unsigned long grpmask;
    +  5   int grplo;
    +  6   int grphi;
    +
    + +

    The ->parent pointer references the rcu_node +one level up in the tree, and is NULL for the root +rcu_node. +The RCU implementation makes heavy use of this field to push quiescent +states up the tree. +The ->level field gives the level in the tree, with +the root being at level zero, its children at level one, and so on. +The ->grpnum field gives this node's position within +the children of its parent, so this number can range between 0 and 31 +on 32-bit systems and between 0 and 63 on 64-bit systems. +The ->level and ->grpnum fields are +used only during initialization and for tracing. +The ->grpmask field is the bitmask counterpart of +->grpnum, and therefore always has exactly one bit set. +This mask is used to clear the bit corresponding to this rcu_node +structure in its parent's bitmasks, which are described later. +Finally, the ->grplo and ->grphi fields +contain the lowest and highest numbered CPU served by this +rcu_node structure, respectively. + +
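+
+The relationship between ->grpnum and ->grpmask can be captured in a
+single line.
+As a hedged sketch of what tree initialization might do (the variable
+name rnp is illustrative):
+
+  rnp->grpmask = 1UL << rnp->grpnum;  /* Exactly one bit set. */
+
+This is the bit that gets cleared in the parent rcu_node structure's
+bitmasks when this structure reports a quiescent state, as described
+below.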

    All of these fields are constant, and thus do not require any +synchronization. + +

    Synchronization
    + +

    This field of the rcu_node structure is declared +as follows: + +

    +  1   raw_spinlock_t lock;
    +
    + +

    This field is used to protect the remaining fields in this structure, +unless otherwise stated. +That said, all of the fields in this structure can be accessed without +locking for tracing purposes. +Yes, this can result in confusing traces, but better some tracing confusion +than to be heisenbugged out of existence. + +

    Grace-Period Tracking
    + +

    This portion of the rcu_node structure is declared +as follows: + +

    +  1   unsigned long gpnum;
    +  2   unsigned long completed;
    +
    + +

    These fields are the counterparts of the fields of the same name in +the rcu_state structure. +They each may lag up to one behind their rcu_state +counterparts. +If a given rcu_node structure's ->gpnum and +->completed fields are equal, then this rcu_node +structure believes that RCU is idle. +Otherwise, as with the rcu_state structure, +the ->gpnum field will be one greater than the +->completed field, with ->gpnum +indicating which grace period this rcu_node believes +is still being waited for. + +

    The ->gpnum field of each rcu_node +structure is updated at the beginning +of each grace period, and its ->completed field is +updated at the end of each grace period. + +

    Quiescent-State Tracking
    + +

    These fields manage the propagation of quiescent states up the +combining tree. + +

    This portion of the rcu_node structure has fields +as follows: + +

    +  1   unsigned long qsmask;
    +  2   unsigned long expmask;
    +  3   unsigned long qsmaskinit;
    +  4   unsigned long expmaskinit;
    +
    + +

    The ->qsmask field tracks which of this +rcu_node structure's children still need to report +quiescent states for the current normal grace period. +Such children will have a value of 1 in their corresponding bit. +Note that the leaf rcu_node structures should be +thought of as having rcu_data structures as their +children. +Similarly, the ->expmask field tracks which +of this rcu_node structure's children still need to report +quiescent states for the current expedited grace period. +An expedited grace period has +the same conceptual properties as a normal grace period, but the +expedited implementation accepts extreme CPU overhead to obtain +much lower grace-period latency, for example, consuming a few +tens of microseconds worth of CPU time to reduce grace-period +duration from milliseconds to tens of microseconds. +The ->qsmaskinit field tracks which of this +rcu_node structure's children cover at least +one online CPU. +This mask is used to initialize ->qsmask, +and ->expmaskinit is used to initialize +->expmask, at the beginnings of the +normal and expedited grace periods, respectively. + + + + + + + + +
     
    Quick Quiz:
    + Why are these bitmasks protected by locking? + Come on, haven't you heard of atomic instructions??? +
    Answer:
    + Lockless grace-period computation! Such a tantalizing possibility! + + +

    But consider the following sequence of events: + + +

      +
    1. CPU 0 has been in dyntick-idle + mode for quite some time. + When it wakes up, it notices that the current RCU + grace period needs it to report in, so it sets a + flag where the scheduling clock interrupt will find it. +

      +

    2. Meanwhile, CPU 1 is running + force_quiescent_state(), + and notices that CPU 0 has been in dyntick-idle mode, + which qualifies as an extended quiescent state. +

      +

    3. CPU 0's scheduling clock + interrupt fires in the + middle of an RCU read-side critical section, and notices + that the RCU core needs something, so commences RCU softirq + processing. + +

      +

    4. CPU 0's softirq handler + executes and is just about ready + to report its quiescent state up the rcu_node + tree. +

      +

    5. But CPU 1 beats it to the punch, + completing the current + grace period and starting a new one. +

      +

    6. CPU 0 now reports its quiescent + state for the wrong + grace period. + That grace period might now end before the RCU read-side + critical section. + If that happens, disaster will ensue. + +
    + +

    So the locking is absolutely required in + order to coordinate + clearing of the bits with updates to the grace-period numbers in + ->gpnum and ->completed. +

     
    + +
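+
+The coordination described in this Quick Quiz's answer might look
+roughly as follows.
+This is a heavily simplified and hedged sketch rather than the
+kernel's actual reporting code, and it omits the propagation to the
+parent rcu_node structure:
+
+  /* Report a quiescent state for the child whose bit is "mask". */
+  raw_spin_lock_irqsave(&rnp->lock, flags);
+  if (rnp->gpnum != gps) {
+          /* The sampled quiescent state was for some earlier
+           * grace period, so do not apply it to this one. */
+          raw_spin_unlock_irqrestore(&rnp->lock, flags);
+          return;
+  }
+  rnp->qsmask &= ~mask;
+  if (rnp->qsmask == 0) {
+          /* All children have reported: propagate upward or,
+           * at the root, end the grace period. */
+  }
+  raw_spin_unlock_irqrestore(&rnp->lock, flags);
+
+Here gps is assumed to be the grace-period number in effect when the
+quiescent state was originally detected.
+Because the grace-period check and the bit-clearing both execute under
+->lock, the disaster scenario called out above cannot occur.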

    Blocked-Task Management
    + +

    PREEMPT_RCU allows tasks to be preempted in the +midst of their RCU read-side critical sections, and these tasks +must be tracked explicitly. +The details of exactly why and how they are tracked will be covered +in a separate article on RCU read-side processing. +For now, it is enough to know that the rcu_node +structure tracks them. + +

    +  1   struct list_head blkd_tasks;
    +  2   struct list_head *gp_tasks;
    +  3   struct list_head *exp_tasks;
    +  4   bool wait_blkd_tasks;
    +
    + +

    The ->blkd_tasks field is a list header for +the list of blocked and preempted tasks. +As tasks undergo context switches within RCU read-side critical +sections, their task_struct structures are enqueued +(via the task_struct's ->rcu_node_entry +field) onto the head of the ->blkd_tasks list for the +leaf rcu_node structure corresponding to the CPU +on which the outgoing context switch executed. +As these tasks later exit their RCU read-side critical sections, +they remove themselves from the list. +This list is therefore in reverse time order, so that if one of the tasks +is blocking the current grace period, all subsequent tasks must +also be blocking that same grace period. +Therefore, a single pointer into this list suffices to track +all tasks blocking a given grace period. +That pointer is stored in ->gp_tasks for normal +grace periods and in ->exp_tasks for expedited +grace periods. +These last two fields are NULL if either there is +no grace period in flight or if there are no blocked tasks +preventing that grace period from completing. +If either of these two pointers is referencing a task that +removes itself from the ->blkd_tasks list, +then that task must advance the pointer to the next task on +the list, or set the pointer to NULL if there +are no subsequent tasks on the list. + +
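+
+As a hedged sketch (with simplified list handling and an illustrative
+helper), the pointer-advancement rule just described might look like
+this when task t removes itself, all under the leaf rcu_node
+structure's ->lock:
+
+  if (rnp->gp_tasks == &t->rcu_node_entry)
+          rnp->gp_tasks = next_or_null(rnp, &t->rcu_node_entry);
+  if (rnp->exp_tasks == &t->rcu_node_entry)
+          rnp->exp_tasks = next_or_null(rnp, &t->rcu_node_entry);
+  list_del_init(&t->rcu_node_entry);
+
+Here next_or_null() is a hypothetical helper returning a pointer to
+the next entry on the ->blkd_tasks list, or NULL if t was the last
+entry.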

    For example, suppose that tasks T1, T2, and T3 are +all hard-affinitied to the largest-numbered CPU in the system. +Then if task T1 blocked in an RCU read-side +critical section, then an expedited grace period started, +then task T2 blocked in an RCU read-side critical section, +then a normal grace period started, and finally task T3 blocked +in an RCU read-side critical section, then the state of the +last leaf rcu_node structure's blocked-task list +would be as shown below: + +

    blkd_task.svg + +

    Task T1 is blocking both grace periods, task T2 is +blocking only the normal grace period, and task T3 is blocking +neither grace period. +Note that these tasks will not remove themselves from this list +immediately upon resuming execution. +They will instead remain on the list until they execute the outermost +rcu_read_unlock() that ends their RCU read-side critical +section. + +

    +The ->wait_blkd_tasks field indicates whether or not +the current grace period is waiting on a blocked task. + +

    Sizing the rcu_node Array
    + +

    The rcu_node array is sized via a series of +C-preprocessor expressions as follows: + +

    + 1 #ifdef CONFIG_RCU_FANOUT
    + 2 #define RCU_FANOUT CONFIG_RCU_FANOUT
    + 3 #else
    + 4 # ifdef CONFIG_64BIT
    + 5 # define RCU_FANOUT 64
    + 6 # else
    + 7 # define RCU_FANOUT 32
    + 8 # endif
    + 9 #endif
    +10
    +11 #ifdef CONFIG_RCU_FANOUT_LEAF
    +12 #define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
    +13 #else
    +14 # ifdef CONFIG_64BIT
    +15 # define RCU_FANOUT_LEAF 64
    +16 # else
    +17 # define RCU_FANOUT_LEAF 32
    +18 # endif
    +19 #endif
    +20
    +21 #define RCU_FANOUT_1        (RCU_FANOUT_LEAF)
    +22 #define RCU_FANOUT_2        (RCU_FANOUT_1 * RCU_FANOUT)
    +23 #define RCU_FANOUT_3        (RCU_FANOUT_2 * RCU_FANOUT)
    +24 #define RCU_FANOUT_4        (RCU_FANOUT_3 * RCU_FANOUT)
    +25
    +26 #if NR_CPUS <= RCU_FANOUT_1
    +27 #  define RCU_NUM_LVLS        1
    +28 #  define NUM_RCU_LVL_0        1
    +29 #  define NUM_RCU_NODES        NUM_RCU_LVL_0
    +30 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0 }
    +31 #  define RCU_NODE_NAME_INIT  { "rcu_node_0" }
    +32 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0" }
    +33 #  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0" }
    +34 #elif NR_CPUS <= RCU_FANOUT_2
    +35 #  define RCU_NUM_LVLS        2
    +36 #  define NUM_RCU_LVL_0        1
    +37 #  define NUM_RCU_LVL_1        DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
    +38 #  define NUM_RCU_NODES        (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
    +39 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
    +40 #  define RCU_NODE_NAME_INIT  { "rcu_node_0", "rcu_node_1" }
    +41 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0", "rcu_node_fqs_1" }
    +42 #  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0", "rcu_node_exp_1" }
    +43 #elif NR_CPUS <= RCU_FANOUT_3
    +44 #  define RCU_NUM_LVLS        3
    +45 #  define NUM_RCU_LVL_0        1
    +46 #  define NUM_RCU_LVL_1        DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
    +47 #  define NUM_RCU_LVL_2        DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
    +48 #  define NUM_RCU_NODES        (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
    +49 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
    +50 #  define RCU_NODE_NAME_INIT  { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
    +51 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
    +52 #  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
    +53 #elif NR_CPUS <= RCU_FANOUT_4
    +54 #  define RCU_NUM_LVLS        4
    +55 #  define NUM_RCU_LVL_0        1
    +56 #  define NUM_RCU_LVL_1        DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
    +57 #  define NUM_RCU_LVL_2        DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
    +58 #  define NUM_RCU_LVL_3        DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
    +59 #  define NUM_RCU_NODES        (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
    +60 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
    +61 #  define RCU_NODE_NAME_INIT  { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
    +62 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
    +63 #  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
    +64 #else
    +65 # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
    +66 #endif
    +
    + +

    The maximum number of levels in the rcu_node structure +is currently limited to four, as specified by lines 21-24 +and the structure of the subsequent “if” statement. +For 32-bit systems, this allows 16*32*32*32=524,288 CPUs, which +should be sufficient for the next few years at least. +For 64-bit systems, 16*64*64*64=4,194,304 CPUs is allowed, which +should see us through the next decade or so. +This four-level tree also allows kernels built with +CONFIG_RCU_FANOUT=8 to support up to 4096 CPUs, +which might be useful in very large systems having eight CPUs per +socket (but please note that no one has yet shown any measurable +performance degradation due to misaligned socket and rcu_node +boundaries). +In addition, building kernels with a full four levels of rcu_node +tree permits better testing of RCU's combining-tree code. + +

    The RCU_FANOUT symbol controls how many children +are permitted at each non-leaf level of the rcu_node tree. +If the CONFIG_RCU_FANOUT Kconfig option is not specified, +it is set based on the word size of the system, which is also +the Kconfig default. + +

    The RCU_FANOUT_LEAF symbol controls how many CPUs are +handled by each leaf rcu_node structure. +Experience has shown that allowing a given leaf rcu_node +structure to handle 64 CPUs, as permitted by the number of bits in +the ->qsmask field on a 64-bit system, results in +excessive contention for the leaf rcu_node structures' +->lock fields. +The number of CPUs per leaf rcu_node structure is therefore +limited to 16 given the default value of CONFIG_RCU_FANOUT_LEAF. +If CONFIG_RCU_FANOUT_LEAF is unspecified, the value +selected is based on the word size of the system, just as for +CONFIG_RCU_FANOUT. +Lines 11-19 perform this computation. + +

    Lines 21-24 compute the maximum number of CPUs supported by +a single-level (which contains a single rcu_node structure), +two-level, three-level, and four-level rcu_node tree, +respectively, given the fanout specified by RCU_FANOUT +and RCU_FANOUT_LEAF. +These numbers of CPUs are retained in the +RCU_FANOUT_1, +RCU_FANOUT_2, +RCU_FANOUT_3, and +RCU_FANOUT_4 +C-preprocessor variables, respectively. + +

    These variables are used to control the C-preprocessor #if +statement spanning lines 26-66 that computes the number of +rcu_node structures required for each level of the tree, +as well as the number of levels required. +The number of levels is placed in the RCU_NUM_LVLS +C-preprocessor variable by lines 27, 35, 44, and 54. +The number of rcu_node structures for the topmost level +of the tree is always exactly one, and this value is unconditionally +placed into NUM_RCU_LVL_0 by lines 28, 36, 45, and 55. +The rest of the levels (if any) of the rcu_node tree +are computed by dividing the maximum number of CPUs by the +fanout supported by the number of levels from the current level down, +rounding up. This computation is performed by lines 37, +46-47, and 56-58. +Lines 31-33, 40-42, 50-52, and 61-63 create initializers +for lockdep lock-class names. +Finally, lines 64-66 produce an error if the maximum number of +CPUs is too large for the specified fanout. + +
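+
+For a concrete example, consider a 64-bit kernel built with
+NR_CPUS=1000 and with the default values RCU_FANOUT=64 and
+RCU_FANOUT_LEAF=16.
+The preprocessor logic above then evaluates as follows:
+
+  RCU_FANOUT_1 = 16      (1000 > 16, so one level does not suffice)
+  RCU_FANOUT_2 = 1024    (1000 <= 1024, so two levels suffice)
+  RCU_NUM_LVLS = 2
+  NUM_RCU_LVL_0 = 1
+  NUM_RCU_LVL_1 = DIV_ROUND_UP(1000, 16) = 63
+  NUM_RCU_NODES = 1 + 63 = 64
+
+In other words, a single root rcu_node structure fans out to 63 leaf
+rcu_node structures, each handling at most 16 CPUs.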

    +The rcu_data Structure

    + +

    The rcu_data structure maintains the per-CPU state for the +corresponding flavor of RCU. +The fields in this structure may be accessed only from the corresponding +CPU (and from tracing) unless otherwise stated. +This structure is the +focus of quiescent-state detection and RCU callback queuing. +It also tracks its relationship to the corresponding leaf +rcu_node structure to allow more-efficient +propagation of quiescent states up the rcu_node +combining tree. +Like the rcu_node structure, it provides a local +copy of the grace-period information to allow synchronized +access to this information from the corresponding CPU, +essentially for free. +Finally, this structure records past dyntick-idle state +for the corresponding CPU and also tracks statistics. + +

    The rcu_data structure's fields are discussed, +singly and in groups, in the following sections. + +

    Connection to Other Data Structures
    + +

    This portion of the rcu_data structure is declared +as follows: + +

    +  1   int cpu;
    +  2   struct rcu_state *rsp;
    +  3   struct rcu_node *mynode;
    +  4   struct rcu_dynticks *dynticks;
    +  5   unsigned long grpmask;
    +  6   bool beenonline;
    +
    + +

    The ->cpu field contains the number of the +corresponding CPU, the ->rsp pointer references +the corresponding rcu_state structure (and is most frequently +used to locate the name of the corresponding flavor of RCU for tracing), +and the ->mynode field references the corresponding +rcu_node structure. +The ->mynode field is used to propagate quiescent states +up the combining tree. +

    The ->dynticks pointer references the +rcu_dynticks structure corresponding to this +CPU. +Recall that a single per-CPU instance of the rcu_dynticks +structure is shared among all flavors of RCU. +These first four fields are constant and therefore require no +synchronization. + +

    The ->grpmask field indicates the bit in +the ->mynode->qsmask corresponding to this +rcu_data structure, and is also used when propagating +quiescent states. +The ->beenonline flag is set whenever the corresponding +CPU comes online, which means that the debugfs tracing need not dump +out any rcu_data structure for which this flag is not set. + +

    Quiescent-State and Grace-Period Tracking
    + +

    This portion of the rcu_data structure is declared +as follows: + +

    +  1   unsigned long completed;
    +  2   unsigned long gpnum;
    +  3   bool cpu_no_qs;
    +  4   bool core_needs_qs;
    +  5   bool gpwrap;
    +  6   unsigned long rcu_qs_ctr_snap;
    +
    + +

    The ->completed and ->gpnum +fields are the counterparts of the fields of the same name +in the rcu_state and rcu_node structures. +They may each lag up to one behind their rcu_node +counterparts, but in CONFIG_NO_HZ_IDLE and +CONFIG_NO_HZ_FULL kernels can lag +arbitrarily far behind for CPUs in dyntick-idle mode (but these counters +will catch up upon exit from dyntick-idle mode). +If a given rcu_data structure's ->gpnum and +->completed fields are equal, then this rcu_data +structure believes that RCU is idle. +Otherwise, as with the rcu_state and rcu_node +structures, +the ->gpnum field will be one greater than the +->completed field, with ->gpnum +indicating which grace period this rcu_data believes +is still being waited for. + + + + + + + + +
     
    Quick Quiz:
    + All this replication of the grace period numbers can only cause + massive confusion. + Why not just keep a global pair of counters and be done with it??? +
    Answer:
    + Because if there was only a single global pair of grace-period + numbers, there would need to be a single global lock to allow + safely accessing and updating them. + And if we are not going to have a single global lock, we need + to carefully manage the numbers on a per-node basis. + Recall from the answer to a previous Quick Quiz that the consequences + of applying a previously sampled quiescent state to the wrong + grace period are quite severe. +
     
    + +

    The ->cpu_no_qs flag indicates that the +CPU has not yet passed through a quiescent state, +while the ->core_needs_qs flag indicates that the +RCU core needs a quiescent state from the corresponding CPU. +The ->gpwrap field indicates that the corresponding +CPU has remained idle for so long that the ->completed +and ->gpnum counters are in danger of overflow, which +will cause the CPU to disregard the values of its counters on +its next exit from idle. +Finally, the ->rcu_qs_ctr_snap field is used to detect +cases where a given operation has resulted in a quiescent state +for all flavors of RCU, for example, cond_resched_rcu_qs(). + +

    RCU Callback Handling
    + +

    In the absence of CPU-hotplug events, RCU callbacks are invoked by +the same CPU that registered them. +This is strictly a cache-locality optimization: callbacks can and +do get invoked on CPUs other than the one that registered them. +After all, if the CPU that registered a given callback has gone +offline before the callback can be invoked, there really is no other +choice. + +

    This portion of the rcu_data structure is declared +as follows: + +

    + 1 struct rcu_head *nxtlist;
    + 2 struct rcu_head **nxttail[RCU_NEXT_SIZE];
    + 3 unsigned long nxtcompleted[RCU_NEXT_SIZE];
    + 4 long qlen_lazy;
    + 5 long qlen;
    + 6 long qlen_last_fqs_check;
    + 7 unsigned long n_force_qs_snap;
    + 8 unsigned long n_cbs_invoked;
    + 9 unsigned long n_cbs_orphaned;
    +10 unsigned long n_cbs_adopted;
    +11 long blimit;
    +
    + +

    The ->nxtlist pointer and the +->nxttail[] array form a four-segment list with +older callbacks near the head and newer ones near the tail. +Each segment contains callbacks with the corresponding relationship +to the current grace period. +The pointer out of the end of each of the four segments is referenced +by the element of the ->nxttail[] array indexed by +RCU_DONE_TAIL (for callbacks handled by a prior grace period), +RCU_WAIT_TAIL (for callbacks waiting on the current grace period), +RCU_NEXT_READY_TAIL (for callbacks that will wait on the next +grace period), and +RCU_NEXT_TAIL (for callbacks that are not yet associated +with a specific grace period) +respectively, as shown in the following figure. + +

    nxtlist.svg + +

    In this figure, the ->nxtlist pointer references the +first +RCU callback in the list. +The ->nxttail[RCU_DONE_TAIL] array element references +the ->nxtlist pointer itself, indicating that none +of the callbacks is ready to invoke. +The ->nxttail[RCU_WAIT_TAIL] array element references callback +CB 2's ->next pointer, which indicates that +CB 1 and CB 2 are both waiting on the current grace period. +The ->nxttail[RCU_NEXT_READY_TAIL] array element +references the same RCU callback that ->nxttail[RCU_WAIT_TAIL] +does, which indicates that there are no callbacks waiting on the next +RCU grace period. +The ->nxttail[RCU_NEXT_TAIL] array element references +CB 4's ->next pointer, indicating that all the +remaining RCU callbacks have not yet been assigned to an RCU grace +period. +Note that the ->nxttail[RCU_NEXT_TAIL] array element +always references the last RCU callback's ->next pointer +unless the callback list is empty, in which case it references +the ->nxtlist pointer. + +
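+
+One pleasant consequence of this layout is that enqueuing a new
+callback is a constant-time operation.
+A hedged sketch of the idiom, ignoring locking, interrupt disabling,
+and counter updates:
+
+  head->func = func;
+  head->next = NULL;
+  *rdp->nxttail[RCU_NEXT_TAIL] = head;       /* Append to the list... */
+  rdp->nxttail[RCU_NEXT_TAIL] = &head->next; /* ...and advance the tail. */
+
+Note that this works even when the list is empty, in which case
+->nxttail[RCU_NEXT_TAIL] references the ->nxtlist pointer itself.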

    CPUs advance their callbacks from the +RCU_NEXT_TAIL to the RCU_NEXT_READY_TAIL to the +RCU_WAIT_TAIL to the RCU_DONE_TAIL list segments +as grace periods advance. +The CPU advances the callbacks in its rcu_data structure +whenever it notices that another RCU grace period has completed. +The CPU detects the completion of an RCU grace period by noticing +that the value of its rcu_data structure's +->completed field differs from that of its leaf +rcu_node structure. +Recall that each rcu_node structure's +->completed field is updated at the end of each +grace period. + +
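+
+A hedged sketch of this detection step, again simplified and assuming
+that the relevant lock is held:
+
+  if (rdp->completed != rnp->completed) {
+          /* A grace period has ended, so the callbacks in the
+           * RCU_WAIT_TAIL segment are now done.  Advance the
+           * segments and record the new number. */
+          rdp->completed = rnp->completed;
+  }
+
+The actual implementation must also handle the ->gpwrap case
+described earlier.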

    The ->nxtcompleted[] array records grace-period +numbers corresponding to the list segments. +This allows CPUs that go idle for extended periods to determine +which of their callbacks are ready to be invoked after reawakening. + +

    The ->qlen counter contains the number of +callbacks in ->nxtlist, and the +->qlen_lazy counter contains the number of those callbacks that +are known to only free memory, and whose invocation can therefore +be safely deferred. +The ->qlen_last_fqs_check and +->n_force_qs_snap fields coordinate the forcing of quiescent +states from call_rcu() and friends when callback +lists grow excessively long. + +

    The ->n_cbs_invoked, +->n_cbs_orphaned, and ->n_cbs_adopted +fields count, respectively, the number of callbacks invoked, +the number sent to other CPUs when this CPU goes offline, +and the number received from other CPUs when those other CPUs go offline. +Finally, the ->blimit counter contains the maximum number of +RCU callbacks that may be invoked at a given time. + +

    Dyntick-Idle Handling
    + +

    This portion of the rcu_data structure is declared +as follows: + +

    +  1   int dynticks_snap;
    +  2   unsigned long dynticks_fqs;
    +
    + +The ->dynticks_snap field is used to take a snapshot +of the corresponding CPU's dyntick-idle state when forcing +quiescent states, and is therefore accessed from other CPUs. +Finally, the ->dynticks_fqs field is used to +count the number of times this CPU is determined to be in +dyntick-idle state, and is used for tracing and debugging purposes. + +

    +The rcu_dynticks Structure

    + +

    The rcu_dynticks structure maintains the per-CPU dyntick-idle state +for the corresponding CPU. +Unlike the other structures, rcu_dynticks is not +replicated over the different flavors of RCU. +The fields in this structure may be accessed only from the corresponding +CPU (and from tracing) unless otherwise stated. +Its fields are as follows: + +

    +  1   int dynticks_nesting;
    +  2   int dynticks_nmi_nesting;
    +  3   atomic_t dynticks;
    +
    + +

    The ->dynticks_nesting field counts the +nesting depth of normal interrupts. +In addition, this counter is incremented when exiting dyntick-idle +mode and decremented when entering it. +This counter can therefore be thought of as counting the number +of reasons why this CPU cannot be permitted to enter dyntick-idle +mode, aside from non-maskable interrupts (NMIs). +NMIs are counted by the ->dynticks_nmi_nesting +field, except that NMIs that interrupt non-dyntick-idle execution +are not counted. + +

    Finally, the ->dynticks field counts the corresponding +CPU's transitions to and from dyntick-idle mode, so that this counter +has an even value when the CPU is in dyntick-idle mode and an odd +value otherwise. + + + + + + + + +
     
    Quick Quiz:
    + Why not just count all NMIs? + Wouldn't that be simpler and less error prone? +
    Answer:
    + It seems simpler only until you think hard about how to go about + updating the rcu_dynticks structure's + ->dynticks field. +
     
    + +
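+
+As a hedged sketch, another CPU forcing quiescent states might exploit
+the even/odd encoding as follows (this pairs with the ->dynticks_snap
+field described earlier):
+
+  int snap = atomic_add_return(0, &rdtp->dynticks);
+
+  if ((snap & 0x1) == 0) {
+          /* The value is even, so the CPU is in dyntick-idle mode,
+           * which is an extended quiescent state that may be
+           * reported on its behalf. */
+  }
+
+The atomic_add_return(0, ...) idiom both reads the counter and
+provides the ordering needed when sampling another CPU's state.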

    Additional fields are present for some special-purpose +builds, and are discussed separately. + +

    +The rcu_head Structure

    + +

    Each rcu_head structure represents an RCU callback. +These structures are normally embedded within RCU-protected data +structures whose algorithms use asynchronous grace periods. +In contrast, when using algorithms that block waiting for RCU grace periods, +RCU users need not provide rcu_head structures. + +

    The rcu_head structure has fields as follows: + +

    +  1   struct rcu_head *next;
    +  2   void (*func)(struct rcu_head *head);
    +
    + +

    The ->next field is used +to link the rcu_head structures together in the +lists within the rcu_data structures. +The ->func field is a pointer to the function +to be called when the callback is ready to be invoked, and +this function is passed a pointer to the rcu_head +structure. +However, kfree_rcu() uses the ->func +field to record the offset of the rcu_head +structure within the enclosing RCU-protected data structure. + +

    Both of these fields are used internally by RCU. +From the viewpoint of RCU users, this structure is an +opaque “cookie”. + + + + + + + + +
     
    Quick Quiz:
    + Given that the callback function ->func + is passed a pointer to the rcu_head structure, + how is that function supposed to find the beginning of the + enclosing RCU-protected data structure? +
    Answer:
    + In actual practice, there is a separate callback function per + type of RCU-protected data structure. + The callback function can therefore use the container_of() + macro in the Linux kernel (or other pointer-manipulation facilities + in other software environments) to find the beginning of the + enclosing structure. +
     
    + +
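+
+A minimal example of this pattern, using a hypothetical RCU-protected
+type (all names here are illustrative only):
+
+  struct foo {
+          int data;
+          struct rcu_head rh;
+  };
+
+  static void foo_reclaim(struct rcu_head *head)
+  {
+          struct foo *fp = container_of(head, struct foo, rh);
+
+          kfree(fp);
+  }
+
+  /* After making fp unreachable to new readers: */
+  call_rcu(&fp->rh, foo_reclaim);
+
+Each RCU-protected type thus supplies its own callback function,
+which knows the offset of the rcu_head structure within the
+enclosing structure.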

    +RCU-Specific Fields in the task_struct Structure

    + +

    The CONFIG_PREEMPT_RCU implementation uses some +additional fields in the task_struct structure: + +

    + 1 #ifdef CONFIG_PREEMPT_RCU
    + 2   int rcu_read_lock_nesting;
    + 3   union rcu_special rcu_read_unlock_special;
    + 4   struct list_head rcu_node_entry;
    + 5   struct rcu_node *rcu_blocked_node;
    + 6 #endif /* #ifdef CONFIG_PREEMPT_RCU */
    + 7 #ifdef CONFIG_TASKS_RCU
    + 8   unsigned long rcu_tasks_nvcsw;
    + 9   bool rcu_tasks_holdout;
    +10   struct list_head rcu_tasks_holdout_list;
    +11   int rcu_tasks_idle_cpu;
    +12 #endif /* #ifdef CONFIG_TASKS_RCU */
    +
    + +

    The ->rcu_read_lock_nesting field records the +nesting level for RCU read-side critical sections, and +the ->rcu_read_unlock_special field is a bitmask +that records special conditions that require rcu_read_unlock() +to do additional work. +The ->rcu_node_entry field is used to form lists of +tasks that have blocked within preemptible-RCU read-side critical +sections and the ->rcu_blocked_node field references +the rcu_node structure whose list this task is a member of, +or NULL if it is not blocked within a preemptible-RCU +read-side critical section. + +

    The ->rcu_tasks_nvcsw field tracks the number of +voluntary context switches that this task had undergone at the +beginning of the current tasks-RCU grace period, +->rcu_tasks_holdout is set if the current tasks-RCU +grace period is waiting on this task, ->rcu_tasks_holdout_list +is a list element enqueuing this task on the holdout list, +and ->rcu_tasks_idle_cpu tracks the CPU on which this +idle task is running, but only if the task is currently running, +that is, if the CPU is currently idle. + +

    +Accessor Functions

    + +

    The following listing shows the +rcu_get_root() function and the +rcu_for_each_node_breadth_first(), +rcu_for_each_nonleaf_node_breadth_first(), and +rcu_for_each_leaf_node() macros: + +

    +  1 static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
    +  2 {
    +  3   return &rsp->node[0];
    +  4 }
    +  5
    +  6 #define rcu_for_each_node_breadth_first(rsp, rnp) \
    +  7   for ((rnp) = &(rsp)->node[0]; \
    +  8        (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
    +  9
    + 10 #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
    + 11   for ((rnp) = &(rsp)->node[0]; \
    + 12        (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
    + 13
    + 14 #define rcu_for_each_leaf_node(rsp, rnp) \
    + 15   for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
    + 16        (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
    +
    + +

    The rcu_get_root() function simply returns a pointer to the +first element of the specified rcu_state structure's +->node[] array, which is the root rcu_node +structure. + +

    As noted earlier, the rcu_for_each_node_breadth_first() +macro takes advantage of the layout of the rcu_node +structures in the rcu_state structure's +->node[] array, performing a breadth-first traversal by +simply traversing the array in order. +The rcu_for_each_nonleaf_node_breadth_first() macro operates +similarly, but traverses only the first part of the array, thus excluding +the leaf rcu_node structures. +Finally, the rcu_for_each_leaf_node() macro traverses only +the last part of the array, thus traversing only the leaf +rcu_node structures. + + + + + + + + +
     
    Quick Quiz:
    + What do rcu_for_each_nonleaf_node_breadth_first() and + rcu_for_each_leaf_node() do if the rcu_node tree + contains only a single node? +
    Answer:
    + In the single-node case, + rcu_for_each_nonleaf_node_breadth_first() is a no-op + and rcu_for_each_leaf_node() traverses the single node. +
     
    + +
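+
+As a usage sketch, code that needed to examine each leaf
+rcu_node structure might be written as follows (hedged, with the
+locking shown purely for illustration):
+
+  struct rcu_node *rnp;
+  unsigned long flags;
+
+  rcu_for_each_leaf_node(rsp, rnp) {
+          raw_spin_lock_irqsave(&rnp->lock, flags);
+          /* ... examine rnp->qsmask, rnp->grplo, rnp->grphi, ... */
+          raw_spin_unlock_irqrestore(&rnp->lock, flags);
+  }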

    +Summary

    + +So each flavor of RCU is represented by an rcu_state structure, +which contains a combining tree of rcu_node and +rcu_data structures. +Finally, in CONFIG_NO_HZ_IDLE kernels, each CPU's dyntick-idle +state is tracked by an rcu_dynticks structure. + +If you made it this far, you are well prepared to read the code +walkthroughs in the other articles in this series. + +

    +Acknowledgments

    + +I owe thanks to Cyrill Gorcunov, Mathieu Desnoyers, Dhaval Giani, Paul +Turner, Abhishek Srivastava, Matt Kowalczyk, and Serge Hallyn +for helping me get this document into a more human-readable state. + +

    +Legal Statement

    + +

    This work represents the view of the author and does not necessarily +represent the view of IBM. + +

    Linux is a registered trademark of Linus Torvalds. + +

    Other company, product, and service names may be trademarks or +service marks of others.

diff --git a/Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg b/Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg
new file mode 100644
index 0000000000000..2bf12b4682060
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg
@@ -0,0 +1,939 @@
[SVG image data omitted. The figure shows a struct rcu_state containing a tree of rcu_node structures fanning out to per-CPU rcu_data structures, labeled for CPU 0, CPU 15, CPU 21823, CPU 21839, CPU 43679, CPU 43695, CPU 65519, and CPU 65535.]
diff --git a/Documentation/RCU/Design/Data-Structures/TreeLevel.svg b/Documentation/RCU/Design/Data-Structures/TreeLevel.svg
new file mode 100644
index 0000000000000..7a7eb3bac95cc
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/TreeLevel.svg
@@ -0,0 +1,828 @@
[SVG image data omitted. The figure shows the same rcu_node tree with the struct rcu_state ->level[0], ->level[1], and ->level[2] pointers referencing the first rcu_node structure on each level.]
diff --git a/Documentation/RCU/Design/Data-Structures/TreeMapping.svg b/Documentation/RCU/Design/Data-Structures/TreeMapping.svg
new file mode 100644
index 0000000000000..729cfa9e6cdb8
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/TreeMapping.svg
@@ -0,0 +1,305 @@
[SVG image data omitted. The figure shows an rcu_node tree (nodes labeled 0:7, 0:3, 4:7, 0:1, 2:3, 4:5, and 6:7) embedded in the struct rcu_state ->node[] array.]
diff --git a/Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg b/Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg
new file mode 100644
index 0000000000000..5b416a4b8453f
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg
@@ -0,0 +1,380 @@
[SVG image data omitted. The figure adds the ->level[0], ->level[1], and ->level[2] array entries to the embedded-tree figure above.]
diff --git a/Documentation/RCU/Design/Data-Structures/blkd_task.svg b/Documentation/RCU/Design/Data-Structures/blkd_task.svg
new file mode 100644
index 0000000000000..00e810bb84194
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/blkd_task.svg
@@ -0,0 +1,843 @@
[SVG image data omitted. The figure shows the rcu_sched and rcu_bh rcu_state structures with their rcu_node, rcu_data, and shared rcu_dynticks structures, plus a leaf rcu_node structure whose blkd_tasks list holds tasks T1, T2, and T3, referenced by the gp_tasks and exp_tasks pointers.]
diff --git a/Documentation/RCU/Design/Data-Structures/nxtlist.svg b/Documentation/RCU/Design/Data-Structures/nxtlist.svg
new file mode 100644
index 0000000000000..abc4cc73a0977
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/nxtlist.svg
@@ -0,0 +1,396 @@
[SVG image data omitted. The figure shows the nxtlist callback list holding CB 1 through CB 4, with the nxttail[RCU_DONE_TAIL], nxttail[RCU_WAIT_TAIL], nxttail[RCU_NEXT_READY_TAIL], and nxttail[RCU_NEXT_TAIL] pointers into it.]
--
GitLab

From 6e524a603f0b72281019e4ec29b1022388f9f231 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Tue, 29 Mar 2016 14:22:26 -0700
Subject: [PATCH 294/705] rcutorture: Add OS-jitter capability

This commit adds a --jitter OS-jitter capability to expose bugs based
on no-delay assumptions.

Signed-off-by: Paul E. McKenney
---
 .../selftests/rcutorture/bin/jitter.sh        | 90 +++++++++++++++++++
 tools/testing/selftests/rcutorture/bin/kvm.sh | 18 ++++
 2 files changed, 108 insertions(+)
 create mode 100755 tools/testing/selftests/rcutorture/bin/jitter.sh

diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh
new file mode 100755
index 0000000000000..3633828375e3f
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/jitter.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+#
+# Alternate sleeping and spinning on randomly selected CPUs. The purpose
+# of this script is to inflict random OS jitter on a concurrently running
+# test.
+#
+# Usage: jitter.sh me duration [ sleepmax [ spinmax ] ]
+#
+# me: Random-number-generator seed salt.
+# duration: Time to run in seconds.
+# sleepmax: Maximum microseconds to sleep, defaults to one second.
+# spinmax: Maximum microseconds to spin, defaults to one millisecond.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, you can access it online at
+# http://www.gnu.org/licenses/gpl-2.0.html.
+#
+# Copyright (C) IBM Corporation, 2016
+#
+# Authors: Paul E. McKenney
+
+me=$(($1 * 1000))
+duration=$2
+sleepmax=${3-1000000}
+spinmax=${4-1000}
+
+n=1
+
+starttime=`awk 'BEGIN { print systime(); }' < /dev/null`
+
+while :
+do
+	# Check for done.
+	t=`awk -v s=$starttime 'BEGIN { print systime() - s; }' < /dev/null`
+	if test "$t" -gt "$duration"
+	then
+		exit 0;
+	fi
+
+	# Set affinity to randomly selected CPU
+	cpus=`ls /sys/devices/system/cpu/*/online |
+		sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//' |
+		grep -v '^0*$'`
+	cpumask=`awk -v cpus="$cpus" -v me=$me -v n=$n 'BEGIN {
+		srand(n + me + systime());
+		ncpus = split(cpus, ca);
+		curcpu = ca[int(rand() * ncpus + 1)];
+		mask = lshift(1, curcpu);
+		if (mask + 0 <= 0)
+			mask = 1;
+		printf("%#x\n", mask);
+	}' < /dev/null`
+	n=$(($n+1))
+	if ! taskset -p $cpumask $$ > /dev/null 2>&1
+	then
+		echo taskset failure: '"taskset -p ' $cpumask $$ '"'
+		exit 1
+	fi
+
+	# Sleep a random duration
+	sleeptime=`awk -v me=$me -v n=$n -v sleepmax=$sleepmax 'BEGIN {
+		srand(n + me + systime());
+		printf("%06d", int(rand() * sleepmax));
+	}' < /dev/null`
+	n=$(($n+1))
+	sleep .$sleeptime
+
+	# Spin a random duration
+	limit=`awk -v me=$me -v n=$n -v spinmax=$spinmax 'BEGIN {
+		srand(n + me + systime());
+		printf("%06d", int(rand() * spinmax));
+	}' < /dev/null`
+	n=$(($n+1))
+	for i in $(seq $limit)
+	do
+		echo > /dev/null
+	done
+done
+
+exit 1
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 704e219f67a7a..0d598145873e8 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -48,6 +48,7 @@ resdir=""
 configs=""
 cpus=0
 ds=`date +%Y.%m.%d-%H:%M:%S`
+jitter=0

 . functions.sh
@@ -63,6 +64,7 @@ usage () {
 	echo "	--dryrun sched|script"
 	echo "	--duration minutes"
 	echo "	--interactive"
+	echo "	--jitter N [ maxsleep (us) [ maxspin (us) ] ]"
 	echo "	--kmake-arg kernel-make-arguments"
 	echo "	--mac nn:nn:nn:nn:nn:nn"
 	echo "	--no-initrd"
@@ -122,6 +124,11 @@ do
 	--interactive)
 		TORTURE_QEMU_INTERACTIVE=1; export TORTURE_QEMU_INTERACTIVE
 		;;
+	--jitter)
+		checkarg --jitter "(# threads [ sleep [ spin ] ])" $# "$2" '^-\{,1\}[0-9]\+\( \+[0-9]\+\)\{,2\} *$' '^error$'
+		jitter="$2"
+		shift
+		;;
 	--kmake-arg)
 		checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$'
 		TORTURE_KMAKE_ARG="$2"
@@ -299,6 +306,7 @@ awk < $T/cfgcpu.pack \
 	-v CONFIGDIR="$CONFIGFRAG/" \
 	-v KVM="$KVM" \
 	-v ncpus=$cpus \
+	-v jitter="$jitter" \
 	-v rd=$resdir/$ds/ \
 	-v dur=$dur \
 	-v TORTURE_QEMU_ARG="$TORTURE_QEMU_ARG" \
@@ -359,6 +367,16 @@ function dump(first, pastlast, batchnum)
 	print "\techo ----", cfr[j], cpusr[j] ovf ": Starting kernel. `date` >> " rd "/log";
 	print "fi"
 }
+	njitter = 0;
+	split(jitter, ja);
+	if (ja[1] == -1 && ncpus == 0)
+		njitter = 1;
+	else if (ja[1] == -1)
+		njitter = ncpus;
+	else
+		njitter = ja[1];
+	for (j = 0; j < njitter; j++)
+		print "jitter.sh " j " " dur " " ja[2] " " ja[3] "&"
 	print "wait"
 	print "if test -z \"$TORTURE_BUILDONLY\""
 	print "then"
--
GitLab

From acc1adf5572205c5b3fc9e6983ca8dfb06c94520 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Wed, 30 Mar 2016 10:48:06 -0700
Subject: [PATCH 295/705] rcutorture: Don't rebuild identical kernel

Currently, if the user specifies multiple runs of a given test
configuration, the scripting does multiple kernel builds. This wastes
both time and disk space, so this commit makes the scripting use the
first build for all runs of a given test configuration.

Signed-off-by: Paul E. McKenney
---
 .../rcutorture/bin/kvm-test-1-run.sh | 26 ++++++++++++++-----
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index 73a2656684212..4109f306d8553 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -91,25 +91,33 @@ fi
 # CONFIG_PCMCIA=n
 # CONFIG_CARDBUS=n
 # CONFIG_YENTA=n
-if kvm-build.sh $config_template $builddir $T
+base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'`
+if test "$base_resdir" != "$resdir" -a -f $base_resdir/bzImage -a -f $base_resdir/vmlinux
 then
+	# Rerunning previous test, so use that test's kernel.
+	QEMU="`identify_qemu $base_resdir/vmlinux`"
+	KERNEL=$base_resdir/bzImage
+	ln -s $base_resdir/Make*.out $resdir	# for kvm-recheck.sh
+	ln -s $base_resdir/.config $resdir	# for kvm-recheck.sh
+elif kvm-build.sh $config_template $builddir $T
+then
+	# Had to build a kernel for this test.
 	QEMU="`identify_qemu $builddir/vmlinux`"
 	BOOT_IMAGE="`identify_boot_image $QEMU`"
 	cp $builddir/Make*.out $resdir
+	cp $builddir/vmlinux $resdir
 	cp $builddir/.config $resdir
 	if test -n "$BOOT_IMAGE"
 	then
 		cp $builddir/$BOOT_IMAGE $resdir
+		KERNEL=$resdir/bzImage
 	else
 		echo No identifiable boot image, not running KVM, see $resdir.
 		echo Do the torture scripts know about your architecture?
 	fi
 	parse-build.sh $resdir/Make.out $title
-	if test -f $builddir.wait
-	then
-		mv $builddir.wait $builddir.ready
-	fi
 else
+	# Build failed.
 	cp $builddir/Make*.out $resdir
 	cp $builddir/.config $resdir || :
 	echo Build failed, not running KVM, see $resdir.
@@ -119,6 +127,10 @@ else
 	fi
 	exit 1
 fi
+if test -f $builddir.wait
+then
+	mv $builddir.wait $builddir.ready
+fi
 while test -f $builddir.ready
 do
 	sleep 1
--
GitLab

From e9fb365a8847dfe8a9fccae0dce77abf7276b5da Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Wed, 30 Mar 2016 11:20:48 -0700
Subject: [PATCH 296/705] rcutorture: Dump trace buffer upon shutdown

When running from the scripts, rcutorture is completely headless,
so there is no way to manually dump the trace buffer. This commit
therefore unconditionally dumps the trace buffer upon timed shutdown.
However, if you are using rmmod to end the test, it is still up to you
to manually dump the trace buffer.

Signed-off-by: Paul E.
McKenney --- kernel/torture.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/torture.c b/kernel/torture.c index e912ccd960f0c..fa0bdeee17ac3 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -451,6 +451,7 @@ static int torture_shutdown(void *arg) torture_shutdown_hook(); else VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); + ftrace_dump(DUMP_ALL); kernel_power_off(); /* Shut down the system. */ return 0; } -- GitLab From 0aa67e75b3d59cfe412bfa54ca23797e6c2e3270 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Mar 2016 11:40:44 -0700 Subject: [PATCH 297/705] rcutorture: Add irqs-disabled test for call_rcu() Mutation testing carried out by Iftekhar Ahmed of Oregon State University showed that rcutorture is failing to test invocations of call_rcu() having interrupts disabled. This commit therefore adds interrupt disabling around one of the existing invocations of call_rcu() (and friends). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 633a68a094402..084a28a732eb7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1478,7 +1478,9 @@ static int rcu_torture_barrier_cbs(void *arg) * The above smp_load_acquire() ensures barrier_phase load * is ordered before the folloiwng ->call(). */ + local_irq_disable(); /* Just to test no-irq call_rcu(). */ cur_ops->call(&rcu, rcu_torture_barrier_cbf); + local_irq_enable(); if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); -- GitLab From a54062c0d95921d4fb0edc8d268021bf387e6c75 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Mar 2016 14:16:22 -0700 Subject: [PATCH 298/705] rcutorture: Add boot-time adjustment of leaf fanout Currently, the rcutorture scripts do not test boot-time adjustment of leaf fanout (via the rcutree.rcu_fanout_leaf boot parameter), as was noted during testing carried out by Iftekhar Ahmed of Oregon State University. This commit therefore adjusts TREE04's CONFIG_RCU_FANOUT_LEAF from 4 to 3, and also adds rcutree.rcu_fanout_leaf=4 to its boot parameters. This change forces RCU's boot-time geometry-change code to be exercised. Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/configs/rcu/TREE04 | 2 +- tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 index 39a2c6d7d7ec0..17cbe098b115c 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 @@ -14,7 +14,7 @@ CONFIG_HOTPLUG_CPU=n CONFIG_SUSPEND=n CONFIG_HIBERNATION=n CONFIG_RCU_FANOUT=4 -CONFIG_RCU_FANOUT_LEAF=4 +CONFIG_RCU_FANOUT_LEAF=3 CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot index 0fc8a3428938f..e34c334304472 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot @@ -1 +1 @@ -rcutorture.torture_type=rcu_bh +rcutorture.torture_type=rcu_bh rcutree.rcu_fanout_leaf=4 -- GitLab From 916633a403702549d37ea353e63a68e5b0dc27ad Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 7 Apr 2016 17:12:31 +0200 Subject: [PATCH 299/705] locking/rwsem: Provide down_write_killable() Now that all the architectures implement the necessary glue code we can introduce down_write_killable(). The only difference wrt. regular down_write() is that the slow path waits in TASK_KILLABLE state and the interruption by the fatal signal is reported as -EINTR to the caller. Signed-off-by: Michal Hocko Cc: Andrew Morton Cc: Chris Zankel Cc: David S. Miller Cc: Linus Torvalds Cc: Max Filippov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Signed-off-by: Davidlohr Bueso Cc: Signed-off-by: Jason Low Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-alpha@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: linux-xtensa@linux-xtensa.org Cc: sparclinux@vger.kernel.org Link: http://lkml.kernel.org/r/1460041951-22347-12-git-send-email-mhocko@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/rwsem.h | 6 +++--- include/linux/lockdep.h | 15 +++++++++++++++ include/linux/rwsem.h | 1 + kernel/locking/rwsem.c | 19 +++++++++++++++++++ 4 files changed, 38 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index d759c5f70f497..453744c1d3475 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -102,9 +102,9 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) #define ____down_write(sem, slow_path) \ ({ \ long tmp; \ - struct rw_semaphore* ret = sem; \ + struct rw_semaphore* ret; \ asm volatile("# beginning down_write\n\t" \ - LOCK_PREFIX " xadd %1,(%2)\n\t" \ + LOCK_PREFIX " xadd %1,(%3)\n\t" \ /* adds 0xffff0001, returns the old value */ \ " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" \ /* was the active mask 0 before? 
*/\ @@ -112,7 +112,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) " call " slow_path "\n" \ "1:\n" \ "# ending down_write" \ - : "+m" (sem->count), "=d" (tmp), "+a" (ret) \ + : "+m" (sem->count), "=d" (tmp), "=a" (ret) \ : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \ : "memory", "cc"); \ ret; \ diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index d026b190c5306..accfe56d8c51b 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -444,6 +444,18 @@ do { \ lock_acquired(&(_lock)->dep_map, _RET_IP_); \ } while (0) +#define LOCK_CONTENDED_RETURN(_lock, try, lock) \ +({ \ + int ____err = 0; \ + if (!try(_lock)) { \ + lock_contended(&(_lock)->dep_map, _RET_IP_); \ + ____err = lock(_lock); \ + } \ + if (!____err) \ + lock_acquired(&(_lock)->dep_map, _RET_IP_); \ + ____err; \ +}) + #else /* CONFIG_LOCK_STAT */ #define lock_contended(lockdep_map, ip) do {} while (0) @@ -452,6 +464,9 @@ do { \ #define LOCK_CONTENDED(_lock, try, lock) \ lock(_lock) +#define LOCK_CONTENDED_RETURN(_lock, try, lock) \ + lock(_lock) + #endif /* CONFIG_LOCK_STAT */ #ifdef CONFIG_LOCKDEP diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 7d7ae029dac5a..d1c12d160ace1 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -118,6 +118,7 @@ extern int down_read_trylock(struct rw_semaphore *sem); * lock for writing */ extern void down_write(struct rw_semaphore *sem); +extern int __must_check down_write_killable(struct rw_semaphore *sem); /* * trylock for writing -- returns 1 if successful, 0 if contention diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 205be0ce34de7..c817216c16152 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -54,6 +54,25 @@ void __sched down_write(struct rw_semaphore *sem) EXPORT_SYMBOL(down_write); +/* + * lock for writing + */ +int __sched down_write_killable(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + rwsem_release(&sem->dep_map, 1, _RET_IP_); + return -EINTR; + } + + rwsem_set_owner(sem); + return 0; +} + +EXPORT_SYMBOL(down_write_killable); + /* * trylock for writing -- returns 1 if successful, 0 if contention */ -- GitLab From 00fb16e26ac8559e69c3bb14284f4a548d28ee0d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 13 Apr 2016 11:57:12 +0200 Subject: [PATCH 300/705] locking/rwsem, x86: Add frame annotation for call_rwsem_down_write_failed_killable() 3387a535ce62 ("x86/asm: Create stack frames in rwsem functions") has added FRAME_{BEGIN,END} annotations to rwsem asm stubs. The patch which has added call_rwsem_down_write_failed_killable() was based on an older tree so it didn't know about annotations. Let's add them. 
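As a reference for callers of the API introduced in the previous patch, here is a minimal usage sketch of down_write_killable(); the struct and function are hypothetical, not part of this series:

    /* Hypothetical caller, for illustration only. */
    static int update_shared_state(struct shared_state *s)
    {
            /* Returns 0 with the lock held, or -EINTR if a fatal signal arrived. */
            if (down_write_killable(&s->rwsem))
                    return -EINTR;

            /* ... modify state under the write lock ... */

            up_write(&s->rwsem);
            return 0;
    }

Unlike down_write(), the return value must be checked: the semaphore is held only when 0 is returned.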
This addresses the following objtool warning: arch/x86/lib/rwsem.o: warning: objtool: call_rwsem_down_write_failed_killable()+0xe: call without frame pointer save/setup Reported-by: Ingo Molnar Signed-off-by: Michal Hocko Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1460541432-21631-1-git-send-email-mhocko@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/lib/rwsem.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index 4534a7e912f31..a37462a23546f 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S @@ -107,10 +107,12 @@ ENTRY(call_rwsem_down_write_failed) ENDPROC(call_rwsem_down_write_failed) ENTRY(call_rwsem_down_write_failed_killable) + FRAME_BEGIN save_common_regs movq %rax,%rdi call rwsem_down_write_failed_killable restore_common_regs + FRAME_END ret ENDPROC(call_rwsem_down_write_failed_killable) -- GitLab From 4252db10559fc3d1efc1e43613254fdd220b014b Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 20 Apr 2016 13:55:42 -0700 Subject: [PATCH 301/705] x86/KASLR: Update description for decompressor worst case size The comment that describes the analysis for the size of the decompressor code only took gzip into account (there are currently 6 other decompressors that could be used). The actual z_extract_offset calculation in code was already handling the correct maximum size, but this documentation hadn't been updated. This updates the documentation, fixes several typos, moves the comment to header.S, updates references, and adds a note at the end of the decompressor include list to remind us about updating the comment in the future. (Instead of moving the comment to mkpiggy.c, where the calculation is currently happening, it is being moved to header.S because the calculations in mkpiggy.c will be removed in favor of header.S calculations in a following patch, and it seemed like overkill to move the giant comment twice, especially when there's already reference to z_extract_offset in header.S.) Signed-off-by: Baoquan He [ Rewrote changelog, cleaned up comment style, moved comments around. ] Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1461185746-8017-2-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/kaslr.c | 2 +- arch/x86/boot/compressed/misc.c | 89 +++----------------------------- arch/x86/boot/header.S | 88 +++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 82 deletions(-) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 9c29e7885ef09..7d86c5dd8e996 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -155,7 +155,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, /* * Avoid the region that is unsafe to overlap during - * decompression (see calculations at top of misc.c). + * decompression (see calculations in ../header.S). 
*/ unsafe_len = (output_size >> 12) + 32768 + 18; unsafe = (unsigned long)input + input_size - unsafe_len; diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index ad8c01ac28854..e96829bdb6d21 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -14,90 +14,13 @@ #include "misc.h" #include "../string.h" -/* WARNING!! - * This code is compiled with -fPIC and it is relocated dynamically - * at run time, but no relocation processing is performed. - * This means that it is not safe to place pointers in static structures. - */ - /* - * Getting to provable safe in place decompression is hard. - * Worst case behaviours need to be analyzed. - * Background information: - * - * The file layout is: - * magic[2] - * method[1] - * flags[1] - * timestamp[4] - * extraflags[1] - * os[1] - * compressed data blocks[N] - * crc[4] orig_len[4] - * - * resulting in 18 bytes of non compressed data overhead. - * - * Files divided into blocks - * 1 bit (last block flag) - * 2 bits (block type) - * - * 1 block occurs every 32K -1 bytes or when there 50% compression - * has been achieved. The smallest block type encoding is always used. - * - * stored: - * 32 bits length in bytes. - * - * fixed: - * magic fixed tree. - * symbols. - * - * dynamic: - * dynamic tree encoding. - * symbols. - * - * - * The buffer for decompression in place is the length of the - * uncompressed data, plus a small amount extra to keep the algorithm safe. - * The compressed data is placed at the end of the buffer. The output - * pointer is placed at the start of the buffer and the input pointer - * is placed where the compressed data starts. Problems will occur - * when the output pointer overruns the input pointer. - * - * The output pointer can only overrun the input pointer if the input - * pointer is moving faster than the output pointer. A condition only - * triggered by data whose compressed form is larger than the uncompressed - * form. - * - * The worst case at the block level is a growth of the compressed data - * of 5 bytes per 32767 bytes. - * - * The worst case internal to a compressed block is very hard to figure. - * The worst case can at least be boundined by having one bit that represents - * 32764 bytes and then all of the rest of the bytes representing the very - * very last byte. - * - * All of which is enough to compute an amount of extra data that is required - * to be safe. To avoid problems at the block level allocating 5 extra bytes - * per 32767 bytes of data is sufficient. To avoind problems internal to a - * block adding an extra 32767 bytes (the worst case uncompressed block size) - * is sufficient, to ensure that in the worst case the decompressed data for - * block will stop the byte before the compressed data for a block begins. - * To avoid problems with the compressed data's meta information an extra 18 - * bytes are needed. Leading to the formula: - * - * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. - * - * Adding 8 bytes per 32K is a bit excessive but much easier to calculate. - * Adding 32768 instead of 32767 just makes for round numbers. - * Adding the decompressor_size is necessary as it musht live after all - * of the data as well. Last I measured the decompressor is about 14K. - * 10K of actual data and 4K of bss. - * + * WARNING!! + * This code is compiled with -fPIC and it is relocated dynamically at + * run time, but no relocation processing is performed. 
This means that + * it is not safe to place pointers in static structures. */ -/* - * gzip declarations - */ #define STATIC static #undef memcpy @@ -148,6 +71,10 @@ static int lines, cols; #ifdef CONFIG_KERNEL_LZ4 #include "../../../../lib/decompress_unlz4.c" #endif +/* + * NOTE: When adding a new decompressor, please update the analysis in + * ../header.S. + */ static void scroll(void) { diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 6236b9ec4b764..fd85b9e4e9530 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -440,6 +440,94 @@ setup_data: .quad 0 # 64-bit physical pointer to pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr +# +# Getting to provably safe in-place decompression is hard. Worst case +# behaviours need to be analyzed. Here let's take the decompression of +# a gzip-compressed kernel as example, to illustrate it: +# +# The file layout of gzip compressed kernel is: +# +# magic[2] +# method[1] +# flags[1] +# timestamp[4] +# extraflags[1] +# os[1] +# compressed data blocks[N] +# crc[4] orig_len[4] +# +# ... resulting in +18 bytes overhead of uncompressed data. +# +# (For more information, please refer to RFC 1951 and RFC 1952.) +# +# Files divided into blocks +# 1 bit (last block flag) +# 2 bits (block type) +# +# 1 block occurs every 32K -1 bytes or when there 50% compression +# has been achieved. The smallest block type encoding is always used. +# +# stored: +# 32 bits length in bytes. +# +# fixed: +# magic fixed tree. +# symbols. +# +# dynamic: +# dynamic tree encoding. +# symbols. +# +# +# The buffer for decompression in place is the length of the uncompressed +# data, plus a small amount extra to keep the algorithm safe. The +# compressed data is placed at the end of the buffer. The output pointer +# is placed at the start of the buffer and the input pointer is placed +# where the compressed data starts. Problems will occur when the output +# pointer overruns the input pointer. +# +# The output pointer can only overrun the input pointer if the input +# pointer is moving faster than the output pointer. A condition only +# triggered by data whose compressed form is larger than the uncompressed +# form. +# +# The worst case at the block level is a growth of the compressed data +# of 5 bytes per 32767 bytes. +# +# The worst case internal to a compressed block is very hard to figure. +# The worst case can at least be bounded by having one bit that represents +# 32764 bytes and then all of the rest of the bytes representing the very +# very last byte. +# +# All of which is enough to compute an amount of extra data that is required +# to be safe. To avoid problems at the block level allocating 5 extra bytes +# per 32767 bytes of data is sufficient. To avoid problems internal to a +# block adding an extra 32767 bytes (the worst case uncompressed block size) +# is sufficient, to ensure that in the worst case the decompressed data for +# block will stop the byte before the compressed data for a block begins. +# To avoid problems with the compressed data's meta information an extra 18 +# bytes are needed. Leading to the formula: +# +# extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size +# +# Adding 8 bytes per 32K is a bit excessive but much easier to calculate. +# Adding 32768 instead of 32767 just makes for round numbers. +# Adding the decompressor_size is necessary as it musht live after all +# of the data as well. Last I measured the decompressor is about 14K. +# 10K of actual data and 4K of bss. 
+# +# Above analysis is for decompressing gzip compressed kernel only. Up to +# now 6 different decompressor are supported all together. And among them +# xz stores data in chunks and has maximum chunk of 64K. Hence safety +# margin should be updated to cover all decompressors so that we don't +# need to deal with each of them separately. Please check +# the description in lib/decompressor_xxx.c for specific information. +# +# extra_bytes = (uncompressed_size >> 12) + 65536 + 128 +# +# Note that this calculation, which results in z_extract_offset (below), +# is currently generated in compressed/mkpiggy.c + #define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset) #define VO_INIT_SIZE (VO__end - VO__text) #if ZO_INIT_SIZE > VO_INIT_SIZE -- GitLab From e8581e3d67788b6b29d055fa42c6cb5b258fee64 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 20 Apr 2016 13:55:43 -0700 Subject: [PATCH 302/705] x86/KASLR: Drop CONFIG_RANDOMIZE_BASE_MAX_OFFSET Currently CONFIG_RANDOMIZE_BASE_MAX_OFFSET is used to limit the maximum offset for kernel randomization. This limit doesn't need to be a CONFIG since it is tied completely to KERNEL_IMAGE_SIZE, and will make no sense once physical and virtual offsets are randomized separately. This patch removes CONFIG_RANDOMIZE_BASE_MAX_OFFSET and consolidates the Kconfig help text. [kees: rewrote changelog, dropped KERNEL_IMAGE_SIZE_DEFAULT, rewrote help] Signed-off-by: Baoquan He Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1461185746-8017-3-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 72 +++++++++++----------------- arch/x86/boot/compressed/kaslr.c | 12 ++--- arch/x86/include/asm/page_64_types.h | 8 ++-- arch/x86/mm/init_32.c | 3 -- 4 files changed, 36 insertions(+), 59 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2dc18605831f6..5892d549596d6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1932,54 +1932,38 @@ config RELOCATABLE (CONFIG_PHYSICAL_START) is used as the minimum location. config RANDOMIZE_BASE - bool "Randomize the address of the kernel image" + bool "Randomize the address of the kernel image (KASLR)" depends on RELOCATABLE default n ---help--- - Randomizes the physical and virtual address at which the - kernel image is decompressed, as a security feature that - deters exploit attempts relying on knowledge of the location - of kernel internals. + In support of Kernel Address Space Layout Randomization (KASLR), + this randomizes the physical address at which the kernel image + is decompressed and the virtual address where the kernel + image is mapped, as a security feature that deters exploit + attempts relying on knowledge of the location of kernel + code internals. + + The kernel physical and virtual address can be randomized + from 16MB up to 1GB on 64-bit and 512MB on 32-bit. (Note that + using RANDOMIZE_BASE reduces the memory space available to + kernel modules from 1.5GB to 1GB.) + + Entropy is generated using the RDRAND instruction if it is + supported. If RDTSC is supported, its value is mixed into + the entropy pool as well. If neither RDRAND nor RDTSC are + supported, then entropy is read from the i8254 timer. 
+ + Since the kernel is built using 2GB addressing, and + PHYSICAL_ALIGN must be at a minimum of 2MB, only 10 bits of + entropy is theoretically possible. Currently, with the + default value for PHYSICAL_ALIGN and due to page table + layouts, 64-bit uses 9 bits of entropy and 32-bit uses 8 bits. + + If CONFIG_HIBERNATE is also enabled, KASLR is disabled at boot + time. To enable it, boot with "kaslr" on the kernel command + line (which will also disable hibernation). - Entropy is generated using the RDRAND instruction if it is - supported. If RDTSC is supported, it is used as well. If - neither RDRAND nor RDTSC are supported, then randomness is - read from the i8254 timer. - - The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET, - and aligned according to PHYSICAL_ALIGN. Since the kernel is - built using 2GiB addressing, and PHYSICAL_ALGIN must be at a - minimum of 2MiB, only 10 bits of entropy is theoretically - possible. At best, due to page table layouts, 64-bit can use - 9 bits of entropy and 32-bit uses 8 bits. - - If unsure, say N. - -config RANDOMIZE_BASE_MAX_OFFSET - hex "Maximum kASLR offset allowed" if EXPERT - depends on RANDOMIZE_BASE - range 0x0 0x20000000 if X86_32 - default "0x20000000" if X86_32 - range 0x0 0x40000000 if X86_64 - default "0x40000000" if X86_64 - ---help--- - The lesser of RANDOMIZE_BASE_MAX_OFFSET and available physical - memory is used to determine the maximal offset in bytes that will - be applied to the kernel when kernel Address Space Layout - Randomization (kASLR) is active. This must be a multiple of - PHYSICAL_ALIGN. - - On 32-bit this is limited to 512MiB by page table layouts. The - default is 512MiB. - - On 64-bit this is limited by how the kernel fixmap page table is - positioned, so this cannot be larger than 1GiB currently. Without - RANDOMIZE_BASE, there is a 512MiB to 1.5GiB split between kernel - and modules. When RANDOMIZE_BASE_MAX_OFFSET is above 512MiB, the - modules area will shrink to compensate, up to the current maximum - 1GiB to 1GiB split. The default is 1GiB. - - If unsure, leave at the default value. + If unsure, say N. # Relocation on x86 needs some additional build support config X86_NEED_RELOCS diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 7d86c5dd8e996..3ad71a0afa242 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -217,15 +217,13 @@ static bool mem_avoid_overlap(struct mem_vector *img) return false; } -static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / - CONFIG_PHYSICAL_ALIGN]; +static unsigned long slots[KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN]; static unsigned long slot_max; static void slots_append(unsigned long addr) { /* Overflowing the slots list should be impossible. */ - if (slot_max >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET / - CONFIG_PHYSICAL_ALIGN) + if (slot_max >= KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN) return; slots[slot_max++] = addr; @@ -251,7 +249,7 @@ static void process_e820_entry(struct e820entry *entry, return; /* Ignore entries entirely above our maximum. */ - if (entry->addr >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET) + if (entry->addr >= KERNEL_IMAGE_SIZE) return; /* Ignore entries entirely below our minimum. */ @@ -276,8 +274,8 @@ static void process_e820_entry(struct e820entry *entry, region.size -= region.start - entry->addr; /* Reduce maximum size to fit end of image within maximum limit. 
*/ - if (region.start + region.size > CONFIG_RANDOMIZE_BASE_MAX_OFFSET) - region.size = CONFIG_RANDOMIZE_BASE_MAX_OFFSET - region.start; + if (region.start + region.size > KERNEL_IMAGE_SIZE) + region.size = KERNEL_IMAGE_SIZE - region.start; /* Walk each aligned slot and check for avoided areas. */ for (img.start = region.start, img.size = image_size ; diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 4928cf0d5af0f..d5c2f8b40faab 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -47,12 +47,10 @@ * are fully set up. If kernel ASLR is configured, it can extend the * kernel page table mapping, reducing the size of the modules area. */ -#define KERNEL_IMAGE_SIZE_DEFAULT (512 * 1024 * 1024) -#if defined(CONFIG_RANDOMIZE_BASE) && \ - CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE_DEFAULT -#define KERNEL_IMAGE_SIZE CONFIG_RANDOMIZE_BASE_MAX_OFFSET +#if defined(CONFIG_RANDOMIZE_BASE) +#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024) #else -#define KERNEL_IMAGE_SIZE KERNEL_IMAGE_SIZE_DEFAULT +#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) #endif #endif /* _ASM_X86_PAGE_64_DEFS_H */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bd7a9b9e2e14a..f2ee42d61894d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -804,9 +804,6 @@ void __init mem_init(void) BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); #undef high_memory #undef __FIXADDR_TOP -#ifdef CONFIG_RANDOMIZE_BASE - BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE); -#endif #ifdef CONFIG_HIGHMEM BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); -- GitLab From 1f208de37d10bb9067f3b061d281363be0cd1805 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 20 Apr 2016 13:55:44 -0700 Subject: [PATCH 303/705] x86/boot: Clean up things used by decompressors This rearranges the pieces needed to include the decompressor code in misc.c. It wasn't obvious why things were there, so a comment was added and definitions consolidated. Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1461185746-8017-4-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/misc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index e96829bdb6d21..0381e250a7851 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -21,19 +21,19 @@ * it is not safe to place pointers in static structures. */ +/* Macros used by the included decompressor code below. */ #define STATIC static -#undef memcpy - /* - * Use a normal definition of memset() from string.c. There are already + * Use normal definitions of mem*() from string.c. There are already * included header files which expect a definition of memset() and by * the time we define memset macro, it is too late. */ +#undef memcpy #undef memset #define memzero(s, n) memset((s), 0, (n)) - +/* Functions used by the included decompressor code below. 
*/ static void error(char *m); /* -- GitLab From bf0118dbba9542ceb5d33d4a86830a6c88b0bbf6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 20 Apr 2016 13:55:45 -0700 Subject: [PATCH 304/705] x86/boot: Make memcpy() handle overlaps Two uses of memcpy() (screen scrolling and ELF parsing) were handling overlapping memory areas. While there were no explicitly noticed bugs here (yet), it is best to fix this so that the copying will always be safe. Instead of making a new memmove() function that might collide with other memmove() definitions in the decompressors, this just makes the compressed boot code's copy of memcpy() overlap-safe. Suggested-by: Lasse Collin Reported-by: Yinghai Lu Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1461185746-8017-5-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/misc.c | 4 +--- arch/x86/boot/compressed/string.c | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 0381e250a7851..eacc855ae08e7 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -301,9 +301,7 @@ static void parse_elf(void *output) #else dest = (void *)(phdr->p_paddr); #endif - memcpy(dest, - output + phdr->p_offset, - phdr->p_filesz); + memcpy(dest, output + phdr->p_offset, phdr->p_filesz); break; default: /* Ignore other PT_* */ break; } diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 00e788be1db94..1e10e40f49dd5 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,7 +1,7 @@ #include "../string.c" #ifdef CONFIG_X86_32 -void *memcpy(void *dest, const void *src, size_t n) +void *__memcpy(void *dest, const void *src, size_t n) { int d0, d1, d2; asm volatile( @@ -15,7 +15,7 @@ void *memcpy(void *dest, const void *src, size_t n) return dest; } #else -void *memcpy(void *dest, const void *src, size_t n) +void *__memcpy(void *dest, const void *src, size_t n) { long d0, d1, d2; asm volatile( @@ -39,3 +39,21 @@ void *memset(void *s, int c, size_t n) ss[i] = c; return s; } + +/* + * This memcpy is overlap safe (i.e. it is memmove without conflicting + * with other definitions of memmove from the various decompressors. + */ +void *memcpy(void *dest, const void *src, size_t n) +{ + unsigned char *d = dest; + const unsigned char *s = src; + + if (d <= s || d - s >= n) + return __memcpy(dest, src, n); + + while (n-- > 0) + d[n] = s[n]; + + return dest; +} -- GitLab From 0f8ede1b8c4cb845c53072d7e49d71ca24a61ced Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 20 Apr 2016 13:55:46 -0700 Subject: [PATCH 305/705] x86/KASLR: Warn when KASLR is disabled If KASLR is built in but not available at run-time (either due to the current conflict with hibernation, command-line request, or e820 parsing failures), announce the state explicitly. To support this, a new "warn" function is created, based on the existing "error" function. 
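The behavioural split is worth spelling out: warn() prints the message and returns so that boot can continue (at a non-randomized address in the KASLR case), while error() keeps its halting semantics. A rough sketch of the intended call sites (illustrative messages only; the real implementation is in the diff below):

    warn("KASLR disabled: could not find suitable E820 region!");
    /* prints the message, returns, boot continues unrandomized */

    error("invalid kernel image");          /* hypothetical message */
    /* prints the message plus " -- System halted", then loops on hlt */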
Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: H.J. Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1461185746-8017-6-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/kaslr.c | 6 +++--- arch/x86/boot/compressed/misc.c | 12 +++++++++--- arch/x86/boot/compressed/misc.h | 1 + 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 3ad71a0afa242..8741a6d83bfee 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -314,12 +314,12 @@ unsigned char *choose_random_location(unsigned char *input, #ifdef CONFIG_HIBERNATION if (!cmdline_find_option_bool("kaslr")) { - debug_putstr("KASLR disabled by default...\n"); + warn("KASLR disabled: 'kaslr' not on cmdline (hibernation selected)."); goto out; } #else if (cmdline_find_option_bool("nokaslr")) { - debug_putstr("KASLR disabled by cmdline...\n"); + warn("KASLR disabled: 'nokaslr' on cmdline."); goto out; } #endif @@ -333,7 +333,7 @@ unsigned char *choose_random_location(unsigned char *input, /* Walk e820 and find a random address. */ random_addr = find_random_addr(choice, output_size); if (!random_addr) { - debug_putstr("KASLR could not find suitable E820 region...\n"); + warn("KASLR disabled: could not find suitable E820 region!"); goto out; } diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index eacc855ae08e7..c57d785ff9552 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -166,11 +166,17 @@ void __puthex(unsigned long value) } } -static void error(char *x) +void warn(char *m) { error_putstr("\n\n"); - error_putstr(x); - error_putstr("\n\n -- System halted"); + error_putstr(m); + error_putstr("\n\n"); +} + +static void error(char *m) +{ + warn(m); + error_putstr(" -- System halted"); while (1) asm("hlt"); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 9887e0d4aaeb9..e75f6cf9caafa 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -35,6 +35,7 @@ extern memptr free_mem_end_ptr; extern struct boot_params *boot_params; void __putstr(const char *s); void __puthex(unsigned long value); +void warn(char *m); #define error_putstr(__x) __putstr(__x) #define error_puthex(__x) __puthex(__x) -- GitLab From 18c78a96239749fc4aaee84ca1eb5a8e81f2601d Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:31 -0700 Subject: [PATCH 306/705] x86/boot: Enumerate documentation for the x86 hardware_subarch Although hardware_subarch has been in place since the x86 boot protocol 2.07, it hasn't been used much. Enumerate the current possible values to avoid misuse and help with semantics later at boot time, should this be used further. These enums should only ever be used by architecture x86 code, and all that code should be well contained and compartmentalized; clarify that as well. Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-2-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/bootparam.h | 41 ++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 329254373479a..c18ce67495fad 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -157,7 +157,46 @@ struct boot_params { __u8 _pad9[276]; /* 0xeec */ } __attribute__((packed)); -enum { +/** + * enum x86_hardware_subarch - x86 hardware subarchitecture + * + * The x86 hardware_subarch and hardware_subarch_data were added as of the x86 + * boot protocol 2.07 to help distinguish and support custom x86 boot + * sequences. This enum represents accepted values for the x86 + * hardware_subarch. Custom x86 boot sequences (not X86_SUBARCH_PC) do not + * have or simply *cannot* make use of natural stubs like BIOS or EFI, the + * hardware_subarch can be used on the Linux entry path to revector to a + * subarchitecture stub when needed. This subarchitecture stub can be used to + * set up Linux boot parameters or for special care to account for nonstandard + * handling of page tables. + * + * These enums should only ever be used by x86 code, and the code that uses + * it should be well contained and compartamentalized. + * + * KVM and Xen HVM do not have a subarch as these are expected to follow + * standard x86 boot entries. If there is a genuine need for "hypervisor" type + * that should be considered separately in the future. Future guest types + * should seriously consider working with standard x86 boot stubs such as + * the BIOS or EFI boot stubs. + * + * WARNING: this enum is only used for legacy hacks, for platform features that + * are not easily enumerated or discoverable. You should not ever use + * this for new features. + * + * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard + * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow. + * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest + * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, + * which start at asm startup_xen() entry point and later jump to the C + * xen_start_kernel() entry point. Both domU and dom0 type of guests are + * currently supportd through this PV boot path. + * @X86_SUBARCH_INTEL_MID: Used for Intel MID (Mobile Internet Device) platform + * systems which do not have the PCI legacy interfaces. + * @X86_SUBARCH_CE4100: Used for Intel CE media processor (CE4100) SoC for + * for settop boxes and media devices, the use of a subarch for CE4100 + * is more of a hack... 
+ */ +enum x86_hardware_subarch { X86_SUBARCH_PC = 0, X86_SUBARCH_LGUEST, X86_SUBARCH_XEN, -- GitLab From ea1794812410e92c537c839bedeb2d2b2f87c80d Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:32 -0700 Subject: [PATCH 307/705] x86/xen: Use X86_SUBARCH_XEN for PV guest boots The use of subarch should have no current effect on Xen PV guests, as such this should have no current functional effects. Signed-off-by: Luis R. Rodriguez Reviewed-by: David Vrabel Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-3-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 880862c7d9ddb..61f4d9f67f60f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1670,6 +1670,7 @@ asmlinkage __visible void __init xen_start_kernel(void) boot_params.hdr.ramdisk_image = initrd_start; boot_params.hdr.ramdisk_size = xen_start_info->mod_len; boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); + boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN; if (!xen_initial_domain()) { add_preferred_console("xenboot", 0, NULL); -- GitLab From 907bb655797427cc6498045d6977e77f8363fff7 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:33 -0700 Subject: [PATCH 308/705] tools/lguest: Make lguest launcher use X86_SUBARCH_LGUEST explicitly Be explicit and make use of X86_SUBARCH_LGUEST directly. Signed-off-by: Luis R. Rodriguez Acked-by: Rusty Russell Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-4-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar --- tools/lguest/lguest.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c index 80159e6811c29..ff0aa580c6e1b 100644 --- a/tools/lguest/lguest.c +++ b/tools/lguest/lguest.c @@ -3351,8 +3351,8 @@ int main(int argc, char *argv[]) /* Boot protocol version: 2.07 supports the fields for lguest. */ boot->hdr.version = 0x207; - /* The hardware_subarch value of "1" tells the Guest it's an lguest. */ - boot->hdr.hardware_subarch = 1; + /* X86_SUBARCH_LGUEST tells the Guest it's an lguest. */ + boot->hdr.hardware_subarch = X86_SUBARCH_LGUEST; /* Tell the entry path not to try to reload segment registers. */ boot->hdr.loadflags |= KEEP_SEGMENTS; -- GitLab From 8d152e7a5c7537b18b4e9e0eb96f549b016636dc Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:34 -0700 Subject: [PATCH 309/705] x86/rtc: Replace paravirt rtc check with platform legacy quirk We have 4 types of x86 platforms that disable RTC: * Intel MID * Lguest - uses paravirt * Xen dom-U - uses paravirt * x86 on legacy systems annotated with an ACPI legacy flag. We can consolidate all of these into a platform specific legacy quirk set early in boot through i386_start_kernel() and through x86_64_start_reservations(). This deals with the RTC quirks, which we can rely on through the hardware subarch; the ACPI check can be dealt with separately. For Xen things are a bit more complex, given that the @X86_SUBARCH_XEN x86_hardware_subarch is shared for Xen, which uses the PV path for both domU and dom0. Since the semantics for differentiating between the two are Xen specific, we provide a platform helper to help override default legacy features -- x86_platform.set_legacy_features(). Use of this helper is highly discouraged; its only purpose should be to account for the lack of semantics available within your given x86_hardware_subarch. As per 0-day, this bumps the vmlinux size using i386-tinyconfig as follows: TOTAL TEXT init.text x86_early_init_platform_quirks() +70 +62 +62 +43 Only 8 bytes overhead total, as the main increase in size is all removed via __init. Suggested-by: Ingo Molnar Signed-off-by: Luis R. Rodriguez Reviewed-by: Juergen Gross Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-5-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 1 + arch/x86/include/asm/paravirt.h | 6 ------ arch/x86/include/asm/paravirt_types.h | 5 ----- arch/x86/include/asm/processor.h | 1 - arch/x86/include/asm/x86_init.h | 21 +++++++++++++++++++++ arch/x86/kernel/Makefile | 6 +++++- arch/x86/kernel/head32.c | 2 ++ arch/x86/kernel/head64.c | 1 + arch/x86/kernel/platform-quirks.c | 21 +++++++++++++++++++++ arch/x86/kernel/rtc.c | 7 ++----- arch/x86/lguest/boot.c | 1 - arch/x86/xen/enlighten.c | 10 +++++++--- 12 files changed, 60 insertions(+), 22 deletions(-) create mode 100644 arch/x86/kernel/platform-quirks.c diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 4086abca0b323..f9ed8a7ce2b61 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -209,6 +209,7 @@ endif head-y := arch/x86/kernel/head_$(BITS).o head-y += arch/x86/kernel/head$(BITS).o head-y += arch/x86/kernel/head.o +head-y += arch/x86/kernel/platform-quirks.o libs-y += arch/x86/lib/ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 601f1b8f9961a..6c7a4a1920328 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -20,12 +20,6 @@ static inline int paravirt_enabled(void) return pv_info.paravirt_enabled; } -static inline int paravirt_has_feature(unsigned int feature) -{ - WARN_ON_ONCE(!pv_info.paravirt_enabled); - return (pv_info.features & feature); -} - static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index e8c2326478c8f..6acc1b26cf40a 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -70,14 +70,9 @@ struct pv_info { #endif int paravirt_enabled; - unsigned int features; /* valid only if paravirt_enabled is set */ const char *name; }; -#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x) -/* Supported features */ -#define PV_SUPPORTED_RTC (1<<0) - struct pv_init_ops { /* * Patch may replace one of the defined code sequences with diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 9264476f3d578..0c70c7daa6b83 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -474,7 +474,6 @@ static inline unsigned long current_top_of_stack(void) #else #define __cpuid native_cpuid #define paravirt_enabled() 0 -#define paravirt_has(x) 0 static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 1ae89a2721d6f..8bb8c1a4615a4 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -141,6 +141,15 @@ struct 
x86_cpuinit_ops { struct timespec; +/** + * struct x86_legacy_features - legacy x86 features + * + * @rtc: this device has a CMOS real-time clock present + */ +struct x86_legacy_features { + int rtc; +}; + /** * struct x86_platform_ops - platform specific runtime functions * @calibrate_tsc: calibrate TSC @@ -152,6 +161,14 @@ struct timespec; * @save_sched_clock_state: save state for sched_clock() on suspend * @restore_sched_clock_state: restore state for sched_clock() on resume * @apic_post_init: adjust apic if neeeded + * @legacy: legacy features + * @set_legacy_features: override legacy features. Use of this callback + * is highly discouraged. You should only need + * this if your hardware platform requires further + * custom fine tuning far beyong what may be + * possible in x86_early_init_platform_quirks() by + * only using the current x86_hardware_subarch + * semantics. */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); @@ -165,6 +182,8 @@ struct x86_platform_ops { void (*save_sched_clock_state)(void); void (*restore_sched_clock_state)(void); void (*apic_post_init)(void); + struct x86_legacy_features legacy; + void (*set_legacy_features)(void); }; struct pci_dev; @@ -186,6 +205,8 @@ extern struct x86_cpuinit_ops x86_cpuinit; extern struct x86_platform_ops x86_platform; extern struct x86_msi_ops x86_msi; extern struct x86_io_apic_ops x86_io_apic_ops; + +extern void x86_early_init_platform_quirks(void); extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 616ebd22ef9a2..b81b22ee10bab 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -2,7 +2,11 @@ # Makefile for the linux kernel. # -extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds +extra-y := head_$(BITS).o +extra-y += head$(BITS).o +extra-y += head.o +extra-y += platform-quirks.o +extra-y += vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 2911ef3a9f1c7..d784bb547a9dd 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -34,6 +34,8 @@ asmlinkage __visible void __init i386_start_kernel(void) cr4_init_shadow(); sanitize_boot_params(&boot_params); + x86_early_init_platform_quirks(); + /* Call the subarch specific early setup function */ switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_INTEL_MID: diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 1f4422d5c8d01..b72fb0b71dd1f 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -182,6 +182,7 @@ void __init x86_64_start_reservations(char *real_mode_data) if (!boot_params.hdr.version) copy_bootdata(__va(real_mode_data)); + x86_early_init_platform_quirks(); reserve_ebda_region(); switch (boot_params.hdr.hardware_subarch) { diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c new file mode 100644 index 0000000000000..021a5f973ce32 --- /dev/null +++ b/arch/x86/kernel/platform-quirks.c @@ -0,0 +1,21 @@ +#include +#include + +#include +#include + +void __init x86_early_init_platform_quirks(void) +{ + x86_platform.legacy.rtc = 1; + + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_XEN: + case X86_SUBARCH_LGUEST: + case X86_SUBARCH_INTEL_MID: + x86_platform.legacy.rtc = 0; + break; + } + + if (x86_platform.set_legacy_features) + x86_platform.set_legacy_features(); +} diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 4af8d063fb362..62c48da3889db 
100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_X86_32 /* @@ -188,10 +189,6 @@ static __init int add_rtc_cmos(void) if (of_have_populated_dt()) return 0; - /* Intel MID platforms don't have ioport rtc */ - if (intel_mid_identify_cpu()) - return -ENODEV; - #ifdef CONFIG_ACPI if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { /* This warning can likely go away again in a year or two. */ @@ -200,7 +197,7 @@ static __init int add_rtc_cmos(void) } #endif - if (paravirt_enabled() && !paravirt_has(RTC)) + if (!x86_platform.legacy.rtc) return -ENODEV; platform_device_register(&rtc_device); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index fd57d3ae7e16d..f5497ee5fd2f5 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1414,7 +1414,6 @@ __init void lguest_init(void) pv_info.kernel_rpl = 1; /* Everyone except Xen runs with this set. */ pv_info.shared_kernel_pmd = 1; - pv_info.features = 0; /* * We set up all the lguest overrides for sensitive operations. These diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 61f4d9f67f60f..752029d571bf9 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1193,7 +1193,6 @@ static const struct pv_info xen_info __initconst = { #ifdef CONFIG_X86_64 .extra_user_64bit_cs = FLAT_USER_CS64, #endif - .features = 0, .name = "Xen", }; @@ -1506,6 +1505,11 @@ static void __init xen_pvh_early_guest_init(void) } #endif /* CONFIG_XEN_PVH */ +static void __init xen_dom0_set_legacy_features(void) +{ + x86_platform.legacy.rtc = 1; +} + /* First C function to be called on Xen boot */ asmlinkage __visible void __init xen_start_kernel(void) { @@ -1527,8 +1531,6 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - if (xen_initial_domain()) - pv_info.features |= PV_SUPPORTED_RTC; pv_init_ops = xen_init_ops; if (!xen_pvh_domain()) { pv_cpu_ops = xen_cpu_ops; @@ -1688,6 +1690,8 @@ asmlinkage __visible void __init xen_start_kernel(void) .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS, }; + x86_platform.set_legacy_features = + xen_dom0_set_legacy_features; xen_init_vga(info, xen_start_info->console.dom0.info_size); xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; -- GitLab From 088a8ef8207f19aadbade0971af21ad89fdc3815 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:35 -0700 Subject: [PATCH 310/705] x86/ACPI: Move ACPI_FADT_NO_CMOS_RTC check to ACPI boot code This moves the ACPI specific check into the ACPI boot code; it also takes advantage of x86_platform.legacy.rtc, which is already checked in the RTC initialization code. This lets us remove the nasty #ifdefery and consolidate the checks to use only one toggle to disable the RTC init code. This works because the RTC is initialized by device_initcall(add_rtc_cmos), which runs late in boot from start_kernel() during rest_init(), while acpi_parse_fadt() gets called earlier during setup_arch(). Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-6-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 4 ++++ arch/x86/kernel/rtc.c | 8 -------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8c2f1ef6ca236..8c9c2bdba092b 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -913,6 +913,10 @@ late_initcall(hpet_insert_resource); static int __init acpi_parse_fadt(struct acpi_table_header *table) { + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { + pr_debug("ACPI: not registering RTC platform device\n"); + x86_platform.legacy.rtc = 0; + } #ifdef CONFIG_X86_PM_TIMER /* detect the location of the ACPI PM Timer */ diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 62c48da3889db..ff4f4180fefd2 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -189,14 +189,6 @@ static __init int add_rtc_cmos(void) if (of_have_populated_dt()) return 0; -#ifdef CONFIG_ACPI - if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { - /* This warning can likely go away again in a year or two. */ - pr_info("ACPI: not registering RTC platform device\n"); - return -ENODEV; - } -#endif - if (!x86_platform.legacy.rtc) return -ENODEV; -- GitLab From 1330e3bc544a1951d81b7f3c7d4cecf77d906f67 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:36 -0700 Subject: [PATCH 311/705] x86/init: Use a platform legacy quirk for EBDA This replaces the paravirt_enabled() check with a proper x86 legacy platform quirk. As per 0-day, this bumps the vmlinux size using i386-tinyconfig as follows: TOTAL TEXT init.text x86_early_init_platform_quirks() +39 +35 +35 +25 That's a 4 byte total overhead; the rest is all cleared out upon init as it's all __init text. v2: document 0-day vmlinux size impact Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-7-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/x86_init.h | 3 +++ arch/x86/kernel/head.c | 2 +- arch/x86/kernel/platform-quirks.c | 4 ++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 8bb8c1a4615a4..89d9d57e145df 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -145,9 +145,12 @@ struct timespec; * struct x86_legacy_features - legacy x86 features * * @rtc: this device has a CMOS real-time clock present + * @ebda_search: it's safe to search for the EBDA signature in the hardware's + * low RAM */ struct x86_legacy_features { int rtc; + int ebda_search; }; /** diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index 992f442ca1551..afe65dffee80b 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -38,7 +38,7 @@ void __init reserve_ebda_region(void) * that the paravirt case can handle memory setup * correctly, without our help. */ - if (paravirt_enabled()) + if (!x86_platform.legacy.ebda_search) return; /* end of low (conventional) memory */ diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 021a5f973ce32..01b159781d96c 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -7,8 +7,12 @@ void __init x86_early_init_platform_quirks(void) { x86_platform.legacy.rtc = 1; + x86_platform.legacy.ebda_search = 0; switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_PC: + x86_platform.legacy.ebda_search = 1; + break; case X86_SUBARCH_XEN: case X86_SUBARCH_LGUEST: case X86_SUBARCH_INTEL_MID: -- GitLab From 46504590321dc62a11065f8d00e1b12037c37018 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:37 -0700 Subject: [PATCH 312/705] tools/lguest: Force disable tboot and APM The paravirt_enabled() check is going away, the area tossed to the kernel on lguest is not zeroed out, so ensure lguest force disables tboot and APM just in case the kernel file being read might have this set for whatever reason. Signed-off-by: Luis R. Rodriguez Acked-by: Rusty Russell Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-8-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar --- tools/lguest/lguest.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c index ff0aa580c6e1b..d9836c5eb694c 100644 --- a/tools/lguest/lguest.c +++ b/tools/lguest/lguest.c @@ -3357,6 +3357,12 @@ int main(int argc, char *argv[]) /* Tell the entry path not to try to reload segment registers. */ boot->hdr.loadflags |= KEEP_SEGMENTS; + /* We don't support tboot: */ + boot->tboot_addr = 0; + + /* Ensure this is 0 to prevent APM from loading: */ + boot->apm_bios_info.version = 0; + /* We tell the kernel to initialize the Guest. */ tell_kernel(start); -- GitLab From 8bc55f805697ec2a69c6a576fac8ee36ea9772bb Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:38 -0700 Subject: [PATCH 313/705] x86/apm32: Remove paravirt_enabled() use There is already a check for apm_info.bios == 0; apm_info.bios is set from boot_params.apm_bios_info. Both Xen and lguest, which are also the only ones that set paravirt_enabled to true, never set boot_params.apm_bios_info. The Xen folks are sure a forced disable to 0 is not needed because apm_info lives in .bss, we recently force disabled this on lguest, and on the Xen side just to be sure Boris zeroed out the .bss for PV guests through commit 04b6b4a56884327c1648 ("xen/x86: Zero out .bss for PV guests"). With this care taken into consideration the paravirt_enabled() check is simply not needed anymore. Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-9-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/apm_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 9307f182fe304..c7364bd633e1d 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -2267,7 +2267,7 @@ static int __init apm_init(void) dmi_check_system(apm_dmi_table); - if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) { + if (apm_info.bios.version == 0 || machine_is_olpc()) { printk(KERN_INFO "apm: BIOS not found.\n"); return -ENODEV; } -- GitLab From 44ecf0ef907fe45510566d308d670aa5823a4dd5 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:39 -0700 Subject: [PATCH 314/705] x86/tboot: Remove paravirt_enabled() use There is already a check for boot_params.tboot_addr prior to paravirt_enabled(). Both Xen and lguest, which are also the only ones that set paravirt_enabled to true, never set the boot_params.tboot_addr. The Xen folks are sure a force disable to 0 is not needed because the field lives in .bss, which Boris zeroed out for PV guests just to be sure through commit 04b6b4a56884327c1648 ("xen/x86: Zero out .bss for PV guests"), and we recently force disabled this on lguest. With this care taken into consideration the paravirt_enabled() check is simply not needed anymore. Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-10-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/tboot.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index e72a07f20b05c..9b0185fbe3eb4 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -74,12 +74,6 @@ void __init tboot_probe(void) return; } - /* only a natively booted kernel should be using TXT */ - if (paravirt_enabled()) { - pr_warning("non-0 tboot_addr but pv_ops is enabled\n"); - return; - } - /* Map and check for tboot UUID. */ set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); -- GitLab From fa392794ed8329379f3f637da7c3c2f078309a77 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:40 -0700 Subject: [PATCH 315/705] x86/cpu/intel: Remove not needed paravirt_enabled() use for F00F work around The X86_BUG_F00F work around is responsible for fixing up the error generated on attempted F00F exploitation from an OOPS to a SIGILL. There is no reason why this code should not be allowed to run on PV guest on a F00F-affected CPU -- it would simply never trigger. The pv_enabled() check was there only to avoid printing the f00f workaround, so removing the check is purely a cosmetic change. Suggested-by: Andy Lutomirski Signed-off-by: Luis R. Rodriguez Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-11-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 1f7fdb91a818b..016b3d9ffa7d7 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -233,7 +233,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * The Quark is also family 5, but does not have the same bug. 
*/ clear_cpu_bug(c, X86_BUG_F00F); - if (!paravirt_enabled() && c->x86 == 5 && c->x86_model < 9) { + if (c->x86 == 5 && c->x86_model < 9) { static int f00f_workaround_enabled; set_cpu_bug(c, X86_BUG_F00F); -- GitLab From 80dfd83dfab6e49a31ab8fc484a801aef1c567bd Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:41 -0700 Subject: [PATCH 316/705] x86, drivers/pnpbios: Replace paravirt_enabled() check with legacy device check Since we are removing paravirt_enabled(), replace it with a logical equivalent. Even though PNPBIOS is x86 specific we add an arch-specific type call, which can be implemented by any architecture to show how other legacy attribute devices can later also be checked for with other ACPI legacy attribute flags. This implicates the first ACPI 5.2.9.3 IA-PC Boot Architecture ACPI_FADT_LEGACY_DEVICES flag device, and shows how to add more. The reason pnpbios gets a defined structure and as such uses a different approach than the RTC legacy quirk is that ACPI has a respective RTC flag, while pnpbios does not. We fold the pnpbios quirk under the ACPI_FADT_LEGACY_DEVICES ACPI flag use case, and use a struct of possible devices to enable future extensions of this. As per 0-day, this bumps the vmlinux size using i386-tinyconfig as follows:

  TOTAL   TEXT   init.text   x86_early_init_platform_quirks()
  +32     +28    +28         +28

That's 4 bytes of overhead total; the rest is cleared out on init as it's all __init text. v2: split out subarch handling on the switch to make it easier later to add other subarchs. The 'fall-through' switch handling can be confusing and we'll remove it later when we add handling for X86_SUBARCH_CE4100. v3: document vmlinux size impact as per 0-day, and also explain why pnpbios is treated differently than the RTC legacy feature. Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-12-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/x86_init.h | 26 ++++++++++++++++++++++++++ arch/x86/kernel/platform-quirks.c | 11 +++++++++++ drivers/pnp/pnpbios/core.c | 3 ++- include/linux/pnp.h | 2 ++ 4 files changed, 41 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 89d9d57e145df..4dcdf74dfed86 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -141,16 +141,42 @@ struct x86_cpuinit_ops { struct timespec; +/** + * struct x86_legacy_devices - legacy x86 devices + * + * @pnpbios: this platform can have a PNPBIOS. If this is disabled the platform + * is known to never have a PNPBIOS. + * + * These are devices known to require LPC or ISA bus.
The definition of legacy + * devices adheres to the ACPI 5.2.9.3 IA-PC Boot Architecture flag + * ACPI_FADT_LEGACY_DEVICES. These devices consist of user visible devices on + * the LPC or ISA bus. User visible devices are devices that have end-user + * accessible connectors (for example, LPT parallel port). Legacy devices on + * the LPC bus consist for example of serial and parallel ports, PS/2 keyboard + * / mouse, and the floppy disk controller. A system that lacks all known + * legacy devices can assume all devices can be detected exclusively via + * standard device enumeration mechanisms including the ACPI namespace. + * + * A system which does not have ACPI_FADT_LEGACY_DEVICES enabled must not + * have any of the legacy devices enumerated below present. + */ +struct x86_legacy_devices { + int pnpbios; +}; + /** * struct x86_legacy_features - legacy x86 features * * @rtc: this device has a CMOS real-time clock present * @ebda_search: it's safe to search for the EBDA signature in the hardware's * low RAM + * @devices: legacy x86 devices, refer to struct x86_legacy_devices + * documentation for further details. */ struct x86_legacy_features { int rtc; int ebda_search; + struct x86_legacy_devices devices; }; /** diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 01b159781d96c..ab643825a7aae 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -8,6 +8,7 @@ void __init x86_early_init_platform_quirks(void) { x86_platform.legacy.rtc = 1; x86_platform.legacy.ebda_search = 0; + x86_platform.legacy.devices.pnpbios = 1; switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_PC: @@ -15,6 +16,9 @@ void __init x86_early_init_platform_quirks(void) break; case X86_SUBARCH_XEN: case X86_SUBARCH_LGUEST: + x86_platform.legacy.devices.pnpbios = 0; + x86_platform.legacy.rtc = 0; + break; case X86_SUBARCH_INTEL_MID: x86_platform.legacy.rtc = 0; break; @@ -23,3 +27,10 @@ void __init x86_early_init_platform_quirks(void) if (x86_platform.set_legacy_features) x86_platform.set_legacy_features(); } + +#if defined(CONFIG_PNPBIOS) +bool __init arch_pnpbios_disabled(void) +{ + return x86_platform.legacy.devices.pnpbios == 0; +} +#endif diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c index facd43b8516cf..81603d99082be 100644 --- a/drivers/pnp/pnpbios/core.c +++ b/drivers/pnp/pnpbios/core.c @@ -521,10 +521,11 @@ static int __init pnpbios_init(void) int ret; if (pnpbios_disabled || dmi_check_system(pnpbios_dmi_table) || - paravirt_enabled()) { + arch_pnpbios_disabled()) { printk(KERN_INFO "PnPBIOS: Disabled\n"); return -ENODEV; } + #ifdef CONFIG_PNPACPI if (!acpi_disabled && !pnpacpi_disabled) { pnpbios_disabled = 1; diff --git a/include/linux/pnp.h b/include/linux/pnp.h index 5df733b8f704d..2588ca6a9028d 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h @@ -337,9 +337,11 @@ extern struct mutex pnp_res_mutex; #ifdef CONFIG_PNPBIOS extern struct pnp_protocol pnpbios_protocol; +extern bool arch_pnpbios_disabled(void); #define pnp_device_is_pnpbios(dev) ((dev)->protocol == (&pnpbios_protocol)) #else #define pnp_device_is_pnpbios(dev) 0 +#define arch_pnpbios_disabled() false #endif #ifdef CONFIG_PNPACPI -- GitLab From 7a17b82ccd6671a4bb436df52eedeff906b02735 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez"
Date: Wed, 13 Apr 2016 17:04:42 -0700 Subject: [PATCH 317/705] x86/ACPI: Parse ACPI_FADT_LEGACY_DEVICES The ACPI 5.2.9.3 IA-PC Boot Architecture flag ACPI_FADT_LEGACY_DEVICES can be used to determine if a system has legacy LPC or ISA devices. The x86 platform already has a struct which lists known associated legacy devices; we start off carefully, only disabling root devices we should not regress with. The struct and device list can be expanded with time to cover more root legacy components. Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-13-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8c9c2bdba092b..c9a06e573fa51 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -913,6 +913,11 @@ late_initcall(hpet_insert_resource); static int __init acpi_parse_fadt(struct acpi_table_header *table) { + if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) { + pr_debug("ACPI: no legacy devices present\n"); + x86_platform.legacy.devices.pnpbios = 0; + } + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { pr_debug("ACPI: not registering RTC platform device\n"); x86_platform.legacy.rtc = 0; -- GitLab From f2d85299b7f11f73cc0a294e396cdae114e75787 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:43 -0700 Subject: [PATCH 318/705] x86/init: Rename EBDA code file This makes it clearer what this is. Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-14-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 2 +- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/{head.c => ebda.c} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename arch/x86/kernel/{head.c => ebda.c} (100%) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index f9ed8a7ce2b61..6fce7f096b889 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -208,7 +208,7 @@ endif head-y := arch/x86/kernel/head_$(BITS).o head-y += arch/x86/kernel/head$(BITS).o -head-y += arch/x86/kernel/head.o +head-y += arch/x86/kernel/ebda.o head-y += arch/x86/kernel/platform-quirks.o libs-y += arch/x86/lib/ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b81b22ee10bab..9abf8551c7e4d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -4,7 +4,7 @@ extra-y := head_$(BITS).o extra-y += head$(BITS).o -extra-y += head.o +extra-y += ebda.o extra-y += platform-quirks.o extra-y += vmlinux.lds diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/ebda.c similarity index 100% rename from arch/x86/kernel/head.c rename to arch/x86/kernel/ebda.c -- GitLab From 867fe800b4c423bce46e66ccb2ce91bebbd5afc6 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:44 -0700 Subject: [PATCH 319/705] x86/paravirt: Remove paravirt_enabled() Now that all previous paravirt_enabled() uses were replaced with proper x86 semantics by the previous patches we can remove the unused paravirt_enabled() mechanism. Signed-off-by: Luis R. Rodriguez Acked-by: Juergen Gross Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-15-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/paravirt.h | 5 ----- arch/x86/include/asm/paravirt_types.h | 1 - arch/x86/include/asm/processor.h | 1 - arch/x86/kernel/kvm.c | 8 -------- arch/x86/kernel/paravirt.c | 1 - arch/x86/lguest/boot.c | 2 -- arch/x86/xen/enlighten.c | 1 - 7 files changed, 19 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 6c7a4a1920328..dff26bc91b172 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -15,11 +15,6 @@ #include #include -static inline int paravirt_enabled(void) -{ - return pv_info.paravirt_enabled; -} - static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 6acc1b26cf40a..7fedf24bd8118 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -69,7 +69,6 @@ struct pv_info { u16 extra_user_64bit_cs; /* __USER_CS if none */ #endif - int paravirt_enabled; const char *name; }; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0c70c7daa6b83..8d326e822cb85 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -473,7 +473,6 @@ static inline unsigned long current_top_of_stack(void) #include #else #define __cpuid native_cpuid -#define paravirt_enabled() 0 static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 807950860fb70..c66546f29b819 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -285,14 +285,6 @@ static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; - /* - * KVM isn't paravirt in the sense of paravirt_enabled. A KVM - * guest kernel works like a bare metal kernel with additional - * features, and paravirt_enabled is about features that are - * missing. - */ - pv_info.paravirt_enabled = 0; - if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) pv_cpu_ops.io_delay = kvm_io_delay; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f08ac28b8136d..71a2d8a05a663 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -294,7 +294,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) struct pv_info pv_info = { .name = "bare hardware", - .paravirt_enabled = 0, .kernel_rpl = 0, .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index f5497ee5fd2f5..3847e736702e1 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1408,8 +1408,6 @@ __init void lguest_init(void) { /* We're under lguest. 
*/ pv_info.name = "lguest"; - /* Paravirt is enabled. */ - pv_info.paravirt_enabled = 1; /* We're running at privilege level 1, not 0 as normal. */ pv_info.kernel_rpl = 1; /* Everyone except Xen runs with this set. */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 752029d571bf9..5fc20a1108c77 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1187,7 +1187,6 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, } static const struct pv_info xen_info __initconst = { - .paravirt_enabled = 1, .shared_kernel_pmd = 0, #ifdef CONFIG_X86_64 -- GitLab From f6935b7bfbf8345bea05f73dc48ce81b70f016e0 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:45 -0700 Subject: [PATCH 320/705] x86/init: Disable pnpbios for X86_SUBARCH_INTEL_MID As per hpa, Intel MID platforms can also disable pnpbios: http://lkml.kernel.org/r/5702B5C2.7070101@zytor.com As per 0-day, this bumps the vmlinux size using i386-tinyconfig as follows:

  TOTAL   TEXT   init.text   x86_early_init_platform_quirks()
  -8      -8     -8          -8

Suggested-by: H. Peter Anvin Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-16-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/platform-quirks.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index ab643825a7aae..8539194843400 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -16,10 +16,8 @@ void __init x86_early_init_platform_quirks(void) break; case X86_SUBARCH_XEN: case X86_SUBARCH_LGUEST: - x86_platform.legacy.devices.pnpbios = 0; - x86_platform.legacy.rtc = 0; - break; case X86_SUBARCH_INTEL_MID: + x86_platform.legacy.devices.pnpbios = 0; + x86_platform.legacy.rtc = 0; break; } -- GitLab From a50b22a7a1e60c48ca26cada362076b54823c501 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 13 Apr 2016 17:04:46 -0700 Subject: [PATCH 321/705] x86/init: Disable pnpbios and rtc for X86_SUBARCH_CE4100 As per hpa, CE4100 platforms can also disable pnpbios: http://lkml.kernel.org/r/5702B5C2.7070101@zytor.com Sebastian also recently noted that CE4100 disables the RTC probe as well; to do that he had long ago added the RTC of_have_populated_dt() check, which was meant to skip the RTC probe on all OF platforms, but as of now CE4100 is the only x86 DT platform using it. We can just fold this requirement into the platform quirk then.
This now means that all of these match platform quirks for pnpbios and RTC preferences:

  * X86_SUBARCH_XEN
  * X86_SUBARCH_LGUEST
  * X86_SUBARCH_INTEL_MID
  * X86_SUBARCH_CE4100

Also see: http://lkml.kernel.org/r/570B52EA.60300@linutronix.de Suggested-by: H. Peter Anvin Suggested-by: Sebastian Andrzej Siewior Signed-off-by: Luis R. Rodriguez Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrew.cooper3@citrix.com Cc: andriy.shevchenko@linux.intel.com Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: ffainelli@freebox.fr Cc: george.dunlap@citrix.com Cc: glin@suse.com Cc: jgross@suse.com Cc: jlee@suse.com Cc: josh@joshtriplett.org Cc: julien.grall@linaro.org Cc: konrad.wilk@oracle.com Cc: kozerkov@parallels.com Cc: lenb@kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-acpi@vger.kernel.org Cc: lv.zheng@intel.com Cc: matt@codeblueprint.co.uk Cc: mbizon@freebox.fr Cc: rjw@rjwysocki.net Cc: robert.moore@intel.com Cc: rusty@rustcorp.com.au Cc: tiwai@suse.de Cc: toshi.kani@hp.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1460592286-300-17-git-send-email-mcgrof@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/platform-quirks.c | 1 + arch/x86/kernel/rtc.c | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 8539194843400..b2f8a33b36ff4 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -17,6 +17,7 @@ void __init x86_early_init_platform_quirks(void) case X86_SUBARCH_XEN: case X86_SUBARCH_LGUEST: case X86_SUBARCH_INTEL_MID: + case X86_SUBARCH_CE4100: x86_platform.legacy.devices.pnpbios = 0; x86_platform.legacy.rtc = 0; break; diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index ff4f4180fefd2..eceaa082ec3fc 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -186,9 +186,6 @@ static __init int add_rtc_cmos(void) } } #endif - if (of_have_populated_dt()) - return 0; - if (!x86_platform.legacy.rtc) return -ENODEV; -- GitLab From 8b92c3a78d40fb220dc5ab122e3274d1b126bfbb Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 15 Apr 2016 00:42:47 -0700 Subject: [PATCH 322/705] perf/x86/intel: Add Goldmont CPU support Add perf core PMU support for Intel Goldmont CPU cores:

 - The init code is based on Silvermont.
 - There is a new cache event list, based on the Silvermont cache event list.
 - Goldmont has 32 LBR entries. It also uses the new LBRv6 format, which
   reports the cycle information in the upper 16 bits of LBR_TO.
 - It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS for precise cycles.
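As an editorial aside, that recommendation translates into a raw sampling event along these lines. This is a sketch only: the event/umask 0x3c/0x00 encoding for CPU_CLK_UNHALTED.CORE_P is the customary one and is an assumption to be checked against the SDM referenced next, not something this patch defines:

    struct perf_event_attr attr = {
            .size          = sizeof(attr),
            .type          = PERF_TYPE_RAW,
            .config        = 0x003c,  /* assumed CPU_CLK_UNHALTED.CORE_P encoding */
            .precise_ip    = 2,       /* :pp, which is identical to :ppp here */
            .sample_period = 100003,
    };

With the perf tool this corresponds roughly to 'perf record -e cpu/event=0x3c,umask=0x00/pp'.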
For details, please refer to the latest SDM058: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-3b-part-2-manual.pdf Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1460706167-45320-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 157 +++++++++++++++++++++++++++++++++++ arch/x86/events/intel/ds.c | 6 ++ arch/x86/events/intel/lbr.c | 13 ++- arch/x86/events/perf_event.h | 2 + 4 files changed, 177 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index aff79884e17d2..92fda6bb779ea 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1465,6 +1465,140 @@ static __initconst const u64 slm_hw_cache_event_ids }, }; +static struct extra_reg intel_glm_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x760005ffbfull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x360005ffbfull, RSP_1), + EVENT_EXTRA_END +}; + +#define GLM_DEMAND_DATA_RD BIT_ULL(0) +#define GLM_DEMAND_RFO BIT_ULL(1) +#define GLM_ANY_RESPONSE BIT_ULL(16) +#define GLM_SNP_NONE_OR_MISS BIT_ULL(33) +#define GLM_DEMAND_READ GLM_DEMAND_DATA_RD +#define GLM_DEMAND_WRITE GLM_DEMAND_RFO +#define GLM_DEMAND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) +#define GLM_LLC_ACCESS GLM_ANY_RESPONSE +#define GLM_SNP_ANY (GLM_SNP_NONE_OR_MISS|SNB_NO_FWD|SNB_HITM) +#define GLM_LLC_MISS (GLM_SNP_ANY|SNB_NON_DRAM) + +static __initconst const u64 glm_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, + [C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0380, /* ICACHE.ACCESSES */ + [C(RESULT_MISS)] = 0x0280, /* ICACHE.MISSES */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */ + [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */ + [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */ + [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + }, + [C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, + [C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c0, /* INST_RETIRED.ANY_P */ + [C(RESULT_MISS)] = 0x0481, /* ITLB.MISS */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + 
[C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + }, + [C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [C(RESULT_MISS)] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + }, +}; + +static __initconst const u64 glm_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = GLM_DEMAND_READ| + GLM_LLC_ACCESS, + [C(RESULT_MISS)] = GLM_DEMAND_READ| + GLM_LLC_MISS, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = GLM_DEMAND_WRITE| + GLM_LLC_ACCESS, + [C(RESULT_MISS)] = GLM_DEMAND_WRITE| + GLM_LLC_MISS, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = GLM_DEMAND_PREFETCH| + GLM_LLC_ACCESS, + [C(RESULT_MISS)] = GLM_DEMAND_PREFETCH| + GLM_LLC_MISS, + }, + }, +}; + #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ #define KNL_MCDRAM_LOCAL BIT_ULL(21) @@ -3456,6 +3590,29 @@ __init int intel_pmu_init(void) pr_cont("Silvermont events, "); break; + case 92: /* 14nm Atom "Goldmont" */ + case 95: /* 14nm Atom "Goldmont Denverton" */ + memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_skl(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_glm_pebs_event_constraints; + x86_pmu.extra_regs = intel_glm_extra_regs; + /* + * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS + * for precise cycles. 
+ * :pp is identical to :ppp + */ + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + pr_cont("Goldmont events, "); + break; + case 37: /* 32nm Westmere */ case 44: /* 32nm Westmere-EP */ case 47: /* 32nm Westmere-EX */ diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 8584b90d8e0bb..7ce9f3f669e63 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -645,6 +645,12 @@ struct event_constraint intel_slm_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_glm_pebs_event_constraints[] = { + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), + EVENT_CONSTRAINT_END +}; + struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 6c3b7c1780c98..ad26ca770c981 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -14,7 +14,8 @@ enum { LBR_FORMAT_EIP_FLAGS = 0x03, LBR_FORMAT_EIP_FLAGS2 = 0x04, LBR_FORMAT_INFO = 0x05, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO, + LBR_FORMAT_TIME = 0x06, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, }; static enum { @@ -464,6 +465,16 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) abort = !!(info & LBR_INFO_ABORT); cycles = (info & LBR_INFO_CYCLES); } + + if (lbr_format == LBR_FORMAT_TIME) { + mis = !!(from & LBR_FROM_FLAG_MISPRED); + pred = !mis; + skip = 1; + cycles = ((to >> 48) & LBR_INFO_CYCLES); + + to = (u64)((((s64)to) << 16) >> 16); + } + if (lbr_flags & LBR_EIP_FLAGS) { mis = !!(from & LBR_FROM_FLAG_MISPRED); pred = !mis; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ad4dc7ffffb5e..8b78481d1e641 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -859,6 +859,8 @@ extern struct event_constraint intel_atom_pebs_event_constraints[]; extern struct event_constraint intel_slm_pebs_event_constraints[]; +extern struct event_constraint intel_glm_pebs_event_constraints[]; + extern struct event_constraint intel_nehalem_pebs_event_constraints[]; extern struct event_constraint intel_westmere_pebs_event_constraints[]; -- GitLab From f21d5adceb7f2660e5227569faed278f6fb2072e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 15 Apr 2016 00:53:45 -0700 Subject: [PATCH 323/705] perf/x86/intel: Add LBR filter support for Silvermont and Airmont CPUs LBR filtering is also supported on the Silvermont and Airmont microarchitectures. The layout of MSR_LBR_SELECT is the same as Nehalem. 
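To make the user-visible effect concrete, here is a minimal sketch (an editorial illustration, not part of this patch) of a perf_event_attr that asks for filtered branch records; with this change such filters can be mapped onto MSR_LBR_SELECT bits on Silvermont and Airmont, where previously they had to be applied purely in software:

    struct perf_event_attr attr = {
            .type               = PERF_TYPE_HARDWARE,
            .config             = PERF_COUNT_HW_CPU_CYCLES,
            .size               = sizeof(attr),
            .sample_period      = 100000,
            /* ask for a branch stack with each sample */
            .sample_type        = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK,
            /* branch filters, now backed by MSR_LBR_SELECT */
            .branch_sample_type = PERF_SAMPLE_BRANCH_USER |
                                  PERF_SAMPLE_BRANCH_ANY_CALL,
    };

The attr would then be handed to perf_event_open() as usual.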
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1460706825-46163-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 2 +- arch/x86/events/intel/lbr.c | 18 ++++++++++++++++++ arch/x86/events/perf_event.h | 2 ++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 92fda6bb779ea..79b59437f5ee9 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3581,7 +3581,7 @@ __init int intel_pmu_init(void) memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - intel_pmu_lbr_init_atom(); + intel_pmu_lbr_init_slm(); x86_pmu.event_constraints = intel_slm_event_constraints; x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index ad26ca770c981..317e29e3869ed 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1058,6 +1058,24 @@ void __init intel_pmu_lbr_init_atom(void) pr_cont("8-deep LBR, "); } +/* slm */ +void __init intel_pmu_lbr_init_slm(void) +{ + x86_pmu.lbr_nr = 8; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = MSR_LBR_CORE_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = nhm_lbr_sel_map; + + /* + * SW branch filter usage: + * - compensate for lack of HW filter + */ + pr_cont("8-deep LBR, "); +} + /* Knights Landing */ void intel_pmu_lbr_init_knl(void) { diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 8b78481d1e641..7d62a02f49a41 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -909,6 +909,8 @@ void intel_pmu_lbr_init_nhm(void); void intel_pmu_lbr_init_atom(void); +void intel_pmu_lbr_init_slm(void); + void intel_pmu_lbr_init_snb(void); void intel_pmu_lbr_init_hsw(void); -- GitLab From 9ecda41acb971ebd07c8fb35faf24005c0baea12 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Tue, 5 Apr 2016 14:11:18 +0000 Subject: [PATCH 324/705] perf/core: Add ::write_backward attribute to perf event

This patch introduces a 'write_backward' bit to perf_event_attr, which controls the direction of a ring buffer. When set, the corresponding ring buffer is written from end to beginning. This feature is designed to support reading from an overwritable ring buffer.

A ring buffer can be created by mapping a perf event fd. The kernel puts event records into the ring buffer, and user tooling like perf fetches them from the address returned by mmap(). To prevent racing between kernel and tooling, they communicate with each other through the 'head' and 'tail' pointers. The kernel maintains the 'head' pointer, pointing it to the next free area (the tail of the last record). Tooling maintains the 'tail' pointer, pointing it to the tail of the last consumed record (a record that has already been fetched). The kernel determines the available space in a ring buffer using these two pointers, to avoid overwriting unfetched records.

By mapping without 'PROT_WRITE', an overwritable ring buffer is created. Unlike a normal ring buffer, tooling is unable to maintain the 'tail' pointer because writing is forbidden. Therefore, for this type of ring buffer, the kernel overwrites old records unconditionally, working like a flight recorder.
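As a rough user-space sketch of the setup just described (an editorial illustration, not taken from the patch; error handling is omitted and 'attr' is assumed to be an otherwise initialized struct perf_event_attr):

    #include <linux/perf_event.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* n_data_pages must be a power of two. */
    attr.write_backward = 1;   /* the new bit: write the ring buffer backward */

    int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);

    /* Without PROT_WRITE the kernel treats the buffer as overwritable. */
    long page_size = sysconf(_SC_PAGESIZE);
    void *base = mmap(NULL, (1 + n_data_pages) * page_size,
                      PROT_READ, MAP_SHARED, fd, 0);

Reading such a buffer back is the tricky part, as the changelog goes on to explain.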
This feature would be useful if reading from an overwritable ring buffer were as easy as reading from a normal ring buffer. However, there's an obscure problem.

The following figure demonstrates a full overwritable ring buffer. In this figure, the 'head' pointer points to the end of the last record, and a long record 'E' is pending. For a normal ring buffer, a 'tail' pointer would have pointed to position (X), so the kernel knows there's no more space in the ring buffer. However, for an overwritable ring buffer, the kernel ignores the 'tail' pointer.

 (X)                               head
  .                                 |
  .                                 V
 +------+-------+----------+------+---+
 |A....A|B.....B|C........C|D....D|   |
 +------+-------+----------+------+---+

Record 'A' is overwritten by event 'E':

                                   head
                                    |
                                    V
 +--+---+-------+----------+------+---+
 |.E|..A|B.....B|C........C|D....D|E..|
 +--+---+-------+----------+------+---+

Now tooling decides to read from this ring buffer. However, neither of the two natural positions, 'head' and the start of this ring buffer, points to the head of a record. Even though the full ring buffer can be accessed by tooling, it is unable to find a position to start decoding.

The first known attempt to solve this problem can be found in [1]. It makes the kernel maintain the 'tail' pointer, updating it when the ring buffer is half full. However, this approach introduces overhead to the fast path. Test results show a 1% overhead [2]. In addition, this method utilizes no more than 50% of the records.

Another attempt can be found in [3], which allows putting the size of an event at the end of each record. This approach allows tooling to find records in a backward manner from the 'head' pointer by reading the size of a record from its tail. However, because of the alignment requirement, it needs 8 bytes to record the size of a record, which is a huge waste. Its performance is also not good, because more data needs to be written. This approach also introduces some extra branch instructions to the fast path.

'write_backward' is a better solution to this problem. The following figure demonstrates the state of the overwritable ring buffer when 'write_backward' is set, before overwriting:

      head
       |
       V
 +---+------+----------+-------+------+
 |   |D....D|C........C|B.....B|A....A|
 +---+------+----------+-------+------+

and after overwriting:

                                    head
                                     |
                                     V
 +---+------+----------+-------+---+--+
 |..E|D....D|C........C|B.....B|A..|E.|
 +---+------+----------+-------+---+--+

In each situation, 'head' points to the beginning of the newest record. From this record, tooling can iterate over the full ring buffer and fetch records one by one.

The only limitation that needs to be considered is back-to-back reading. Due to the non-deterministic nature of user programs, it is impossible to ensure that the ring buffer keeps stable during reading. Consider an extreme situation: tooling is scheduled out after reading record 'D', then a burst of events comes and eats up the whole ring buffer (one or multiple rounds). When the tooling process comes back, reading after 'D' is incorrect now.

To prevent this problem, we need to find a way to ensure the ring buffer is stable during reading. ioctl(PERF_EVENT_IOC_PAUSE_OUTPUT) is suggested because its overhead is lower than ioctl(PERF_EVENT_IOC_ENABLE). By carefully verifying the 'header' pointer, a reader can avoid pausing the ring-buffer.
For example:

    /* A union of all possible events */
    union perf_event event;

    p = head = perf_mmap__read_head();
    while (true) {
        /* copy header of next event */
        fetch(&event.header, p, sizeof(event.header));

        /* read 'head' pointer */
        head = perf_mmap__read_head();

        /* check overwritten: is the header good? */
        if (!verify(sizeof(event.header), p, head))
            break;

        /* copy the whole event */
        fetch(&event, p, event.header.size);

        /* read 'head' pointer again */
        head = perf_mmap__read_head();

        /* is the whole event good? */
        if (!verify(event.header.size, p, head))
            break;

        p += event.header.size;
    }

However, the overhead is high because:

 a) In-place decoding is not safe. Copying-verifying-decoding is required.

 b) Fetching the 'head' pointer requires additional synchronization.

(From Alexei Starovoitov: Even when this trick works, a pause is needed for more than stability of reading. When we collect the events into an overwrite buffer we're waiting for some other trigger (like an all-CPU utilization spike, or just one CPU running and all others idle) and when it happens the buffer has valuable info from the past. At this point new events are no longer interesting and the buffer should be paused, events read and unpaused until the next trigger comes.)

This patch utilizes the event's default overflow_handler introduced previously. perf_event_output_backward() is created as the default overflow handler for backward ring buffers. To avoid extra overhead on the fast path, the original perf_event_output() becomes __perf_event_output() and is marked '__always_inline'. In theory, there's no extra overhead introduced to the fast path.

Performance testing: call close(-1) 3000000 times, using gettimeofday() to check the duration, with 'perf record -o /dev/null -e raw_syscalls:*' capturing the system calls. Results are in ns.

Testing environment:

  CPU    : Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
  Kernel : v4.5.0

            MEAN         STDVAR
  BASE    800214.950    2853.083
  PRE1   2253846.700    9997.014
  PRE2   2257495.540    8516.293
  POST   2250896.100    8933.921

Where 'BASE' is pure performance without capturing, 'PRE1' is the test result of the pure 'v4.5.0' kernel, 'PRE2' is the test result before this patch, and 'POST' is the test result after this patch. See [4] for the detailed experimental setup. Considering the stdvar, this patch doesn't introduce a performance overhead to the fast path.

[1] http://lkml.iu.edu/hypermail/linux/kernel/1304.1/04584.html
[2] http://lkml.iu.edu/hypermail/linux/kernel/1307.1/00535.html
[3] http://lkml.iu.edu/hypermail/linux/kernel/1512.0/01265.html
[4] http://lkml.kernel.org/g/56F89DCD.1040202@huawei.com

Signed-off-by: Wang Nan Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexei Starovoitov Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Brendan Gregg Cc: He Kuang Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Zefan Li Link: http://lkml.kernel.org/r/1459865478-53413-1-git-send-email-wangnan0@huawei.com [ Fixed the changelog some more.
] Signed-off-by: Ingo Molnar Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 28 ++++++++++++++++--- include/uapi/linux/perf_event.h | 3 ++- kernel/events/core.c | 48 +++++++++++++++++++++++++++++---- kernel/events/ring_buffer.c | 16 ++++++++++- 4 files changed, 85 insertions(+), 10 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b8b195fbe7874..85749ae8cb5fa 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -834,14 +834,24 @@ extern int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); +extern void perf_event_output_forward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs); +extern void perf_event_output_backward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs); extern void perf_event_output(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs); + struct perf_sample_data *data, + struct pt_regs *regs); static inline bool is_default_overflow_handler(struct perf_event *event) { - return (event->overflow_handler == perf_event_output); + if (likely(event->overflow_handler == perf_event_output_forward)) + return true; + if (unlikely(event->overflow_handler == perf_event_output_backward)) + return true; + return false; } extern void @@ -1051,8 +1061,20 @@ static inline bool has_aux(struct perf_event *event) return event->pmu->setup_aux; } +static inline bool is_write_backward(struct perf_event *event) +{ + return !!event->attr.write_backward; +} + extern int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size); +extern int perf_output_begin_forward(struct perf_output_handle *handle, + struct perf_event *event, + unsigned int size); +extern int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_event *event, + unsigned int size); + extern void perf_output_end(struct perf_output_handle *handle); extern unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index a3c19034d5f8d..43fc8d2134724 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -340,7 +340,8 @@ struct perf_event_attr { comm_exec : 1, /* flag comm events that are due to an exec */ use_clockid : 1, /* use @clockid for time fields */ context_switch : 1, /* context switch data */ - __reserved_1 : 37; + write_backward : 1, /* Write ring buffer from end to beginning */ + __reserved_1 : 36; union { __u32 wakeup_events; /* wakeup every n events */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 21ba024c9ed1b..eabeb2aec00f3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5694,9 +5694,13 @@ void perf_prepare_sample(struct perf_event_header *header, } } -void perf_event_output(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) +static void __always_inline +__perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs, + int (*output_begin)(struct perf_output_handle *, + struct perf_event *, + unsigned int)) { struct perf_output_handle handle; struct perf_event_header header; @@ -5706,7 +5710,7 @@ void perf_event_output(struct perf_event *event, perf_prepare_sample(&header, data, event, regs); - if (perf_output_begin(&handle, event, header.size)) + if (output_begin(&handle, event, header.size)) goto 
exit; perf_output_sample(&handle, &header, data, event); @@ -5717,6 +5721,30 @@ void perf_event_output(struct perf_event *event, rcu_read_unlock(); } +void +perf_event_output_forward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin_forward); +} + +void +perf_event_output_backward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin_backward); +} + +void +perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin); +} + /* * read event_id */ @@ -8153,8 +8181,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (overflow_handler) { event->overflow_handler = overflow_handler; event->overflow_handler_context = context; + } else if (is_write_backward(event)){ + event->overflow_handler = perf_event_output_backward; + event->overflow_handler_context = NULL; } else { - event->overflow_handler = perf_event_output; + event->overflow_handler = perf_event_output_forward; event->overflow_handler_context = NULL; } @@ -8388,6 +8419,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) if (output_event->clock != event->clock) goto out; + /* + * Either writing ring buffer from beginning or from end. + * Mixing is not allowed. + */ + if (is_write_backward(output_event) != is_write_backward(event)) + goto out; + /* * If both events generate aux data, they must be on the same PMU */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 60be55a640408..c49bab42dc574 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -230,10 +230,24 @@ __perf_output_begin(struct perf_output_handle *handle, return -ENOSPC; } +int perf_output_begin_forward(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size) +{ + return __perf_output_begin(handle, event, size, false); +} + +int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size) +{ + return __perf_output_begin(handle, event, size, true); +} + int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, false); + + return __perf_output_begin(handle, event, size, + unlikely(is_write_backward(event))); } unsigned int perf_output_copy(struct perf_output_handle *handle, -- GitLab From dcee75b3b7f025cc6765e6c92ba0a4e59a4d25f4 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sun, 17 Apr 2016 15:03:00 -0700 Subject: [PATCH 325/705] perf/x86/intel/rapl: Support Skylake RAPL domains Add Skylake client support for RAPL domains. In addition to RAPL domains in Broadwell clients, it has support for platform domain (aka PSys). The PSys domain controls the entire SoC instead of just a CPU package. Unlike package domain, PSys support requires more than just processor level implementation. The other parts in the system need additional HW level signaling, which OEMs need to support. When not supported, the energy counter register in PSys domain returns 0. Also corrected error in comment for GPU counter, which previously was DRAM counter. 
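To sketch how the new domain is consumed from user space (an editorial illustration, not part of the patch; read_int() is a hypothetical sysfs-reading helper and error handling is omitted):

    /* The "power" PMU has a dynamic type; read it from sysfs. */
    int type = read_int("/sys/bus/event_source/devices/power/type");  /* hypothetical helper */

    struct perf_event_attr attr = {
            .size   = sizeof(attr),
            .type   = type,
            .config = 0x5,          /* rapl_energy_psys, per the encoding above */
    };

    /* RAPL events are counted system-wide: pid == -1 with an explicit CPU. */
    int fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);

    uint64_t raw;
    read(fd, &raw, sizeof(raw));    /* raw units; scale via the sysfs .scale file */

On systems without the OEM-side support described above, this counter simply reads 0.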
Signed-off-by: Srinivas Pandruvada Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: bp@alien8.de Cc: hpa@zytor.com Cc: jacob.jun.pan@linux.intel.com Cc: rjw@rjwysocki.net Link: http://lkml.kernel.org/r/1460930581-29748-2-git-send-email-srinivas.pandruvada@linux.intel.com Signed-off-by: Ingo Molnar Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 54 ++++++++++++++++++++++++++++++-- arch/x86/include/asm/msr-index.h | 2 ++ 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index c9b7489ae8ee1..26c7d7d8a657e 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -27,10 +27,14 @@ * event: rapl_energy_dram * perf code: 0x3 * - * dram counter: consumption of the builtin-gpu domain (client only) + * gpu counter: consumption of the builtin-gpu domain (client only) * event: rapl_energy_gpu * perf code: 0x4 * + * psys counter: consumption of the builtin-psys domain (client only) + * event: rapl_energy_psys + * perf code: 0x5 + * * We manage those counters as free running (read-only). They may be * use simultaneously by other tools, such as turbostat. * @@ -66,13 +70,16 @@ MODULE_LICENSE("GPL"); #define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ #define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ #define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ +#define RAPL_IDX_PSYS_NRG_STAT 4 /* psys */ +#define INTEL_RAPL_PSYS 0x5 /* pseudo-encoding */ -#define NR_RAPL_DOMAINS 0x4 +#define NR_RAPL_DOMAINS 0x5 static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { "pp0-core", "package", "dram", "pp1-gpu", + "psys", }; /* Clients have PP0, PKG */ @@ -91,6 +98,13 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { 1< Date: Thu, 21 Apr 2016 15:14:17 +0200 Subject: [PATCH 326/705] x86/perf/rapl: Reorder model numbers Re-order the model array to match the order in events/intel/core.c, to easier spot gaps and such. 
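For context, here is roughly how such a match table is consumed (an editorial sketch; the init-side code is not shown in this hunk):

    const struct x86_cpu_id *id = x86_match_cpu(rapl_cpu_match);
    if (!id)
            return -ENODEV;

    /* Each entry's driver_data carries the model-specific init function. */
    const struct intel_rapl_init_fun *init_fun =
            (const struct intel_rapl_init_fun *)id->driver_data;

Since x86_match_cpu() returns the first matching entry, keeping the array in one canonical model order makes gaps, such as the missing Broadwell entry addressed next, much easier to spot.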
Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Srinivas Pandruvada Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 26c7d7d8a657e..1e7b1dfff1c72 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -787,17 +787,22 @@ static const struct intel_rapl_init_fun skl_rapl_init __initconst = { static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(42, snb_rapl_init), /* Sandy Bridge */ + X86_RAPL_MODEL_MATCH(45, snbep_rapl_init), /* Sandy Bridge-EP */ + X86_RAPL_MODEL_MATCH(58, snb_rapl_init), /* Ivy Bridge */ - X86_RAPL_MODEL_MATCH(63, hsx_rapl_init), /* Haswell-Server */ - X86_RAPL_MODEL_MATCH(79, hsx_rapl_init), /* Broadwell-Server */ + X86_RAPL_MODEL_MATCH(62, snbep_rapl_init), /* IvyTown */ + X86_RAPL_MODEL_MATCH(60, hsw_rapl_init), /* Haswell */ + X86_RAPL_MODEL_MATCH(63, hsx_rapl_init), /* Haswell-Server */ X86_RAPL_MODEL_MATCH(69, hsw_rapl_init), /* Haswell-Celeron */ X86_RAPL_MODEL_MATCH(70, hsw_rapl_init), /* Haswell GT3e */ + X86_RAPL_MODEL_MATCH(61, hsw_rapl_init), /* Broadwell */ X86_RAPL_MODEL_MATCH(71, hsw_rapl_init), /* Broadwell-H */ - X86_RAPL_MODEL_MATCH(45, snbep_rapl_init), /* Sandy Bridge-EP */ - X86_RAPL_MODEL_MATCH(62, snbep_rapl_init), /* IvyTown */ + X86_RAPL_MODEL_MATCH(79, hsx_rapl_init), /* Broadwell-Server */ + X86_RAPL_MODEL_MATCH(87, knl_rapl_init), /* Knights Landing */ + X86_RAPL_MODEL_MATCH(78, skl_rapl_init), /* Skylake */ X86_RAPL_MODEL_MATCH(94, skl_rapl_init), /* Skylake H/S */ {}, -- GitLab From 31b84310c79421d726621e800434c66a48a6c959 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 Apr 2016 15:15:47 +0200 Subject: [PATCH 327/705] x86/perf/rapl: Add missing Broadwell model With the array aligned as per events/intel/core.c it was fairly obvious we missed one, add it in. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Srinivas Pandruvada Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 1e7b1dfff1c72..99c4bab123cda 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -800,6 +800,7 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(61, hsw_rapl_init), /* Broadwell */ X86_RAPL_MODEL_MATCH(71, hsw_rapl_init), /* Broadwell-H */ X86_RAPL_MODEL_MATCH(79, hsx_rapl_init), /* Broadwell-Server */ + X86_RAPL_MODEL_MATCH(86, hsx_rapl_init), /* Broadwell Xeon D */ X86_RAPL_MODEL_MATCH(87, knl_rapl_init), /* Knights Landing */ -- GitLab From 1f621e028baf391f6684003e32e009bc934b750f Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 6 Apr 2016 18:47:40 +0530 Subject: [PATCH 328/705] sched/fair: Fix asym packing to select correct CPU When asymmetric packing is set in the sched_domain and target CPU is busy, update_sd_pick_busiest() may not select the busiest runqueue. 
When the target CPU is busy, find_busiest_group() will ignore the asym
packing checks and may continue to load balance using the currently
selected, not-the-busiest, runqueue as the source runqueue. Selecting
the busiest runqueue as the source when the target CPU is busy should
result in much better load balance. Also, when the target CPU is not
busy and asymmetric packing is set in the sched_domain, select the
higher-numbered CPU as the source CPU for load balancing.

While doing this change, move the check to see if the target CPU is
busy into check_asym_packing().

The performance benefit from this change decreases with increasing
load. However, there is benefit in undercommit as well as overcommit
conditions.

1. Records per second for ebizzy (32 threads) on a 64-CPU POWER7 box (5 iterations):

   4.6.0-rc2
   Testcase:        Min          Max          Avg        StdDev
    ebizzy:  5223767.00  10368236.00   7946971.00    1753094.76

   4.6.0-rc2+asym-changes
   Testcase:        Min          Max          Avg        StdDev   %Change
    ebizzy:  8617191.00  13872356.00  11383980.00    1783400.89   +24.78%

2. Records per second for ebizzy (64 threads) on a 64-CPU POWER7 box (5 iterations):

   4.6.0-rc2
   Testcase:        Min          Max          Avg        StdDev
    ebizzy:  6497666.00  18399783.00  10818093.20    4051452.08

   4.6.0-rc2+asym-changes
   Testcase:        Min          Max          Avg        StdDev   %Change
    ebizzy:  7567365.00  19456937.00  11674063.60    4295407.48    +4.40%

3. Records per second for ebizzy (128 threads) on a 64-CPU POWER7 box (5 iterations):

   4.6.0-rc2
   Testcase:         Min          Max          Avg        StdDev
    ebizzy:  37073983.00  40341911.00  38776241.80    1259766.82

   4.6.0-rc2+asym-changes
   Testcase:         Min          Max          Avg        StdDev   %Change
    ebizzy:  38030399.00  41333378.00  39827404.40    1255001.86    +2.54%

Signed-off-by: Srikar Dronamraju
Signed-off-by: Peter Zijlstra (Intel)
Cc: Gautham R Shenoy
Cc: Michael Neuling
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Vaidyanathan Srinivasan
Link: http://lkml.kernel.org/r/1459948660-16073-1-git-send-email-srikar@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b8cc1c35cd7c1..6e371f43fc804 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6679,6 +6679,9 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	if (!(env->sd->flags & SD_ASYM_PACKING))
 		return true;

+	/* No ASYM_PACKING if target cpu is already busy */
+	if (env->idle == CPU_NOT_IDLE)
+		return true;
 	/*
 	 * ASYM_PACKING needs to move all the work to the lowest
 	 * numbered CPUs in the group, therefore mark all groups
@@ -6688,7 +6691,8 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 		if (!sds->busiest)
 			return true;

-		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+		/* Prefer to move from highest possible cpu's work */
+		if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
 			return true;
 	}

@@ -6834,6 +6838,9 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
 	if (!(env->sd->flags & SD_ASYM_PACKING))
 		return 0;

+	if (env->idle == CPU_NOT_IDLE)
+		return 0;
+
 	if (!sds->busiest)
 		return 0;

@@ -7026,8 +7033,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	busiest = &sds.busiest_stat;

 	/* ASYM feature bypasses nice load balance check */
-	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
-	    check_asym_packing(env, &sds))
+	if (check_asym_packing(env, &sds))
 		return sds.busiest;

 	/* There is no busy sibling group to pull tasks from */

-- GitLab
From 21e96f88776deead303ecd30a17d1d7c2a1776e3 Mon Sep 17 00:00:00 2001
From: Steve Muckle
Date: Mon, 21 Mar 2016 17:21:07 -0700
Subject: [PATCH 329/705]
sched/fair: Move cpufreq hook to update_cfs_rq_load_avg() The cpufreq hook should be called whenever the root cfs_rq utilization changes so update_cfs_rq_load_avg() is a better place for it. The current location is not invoked in the enqueue_entity() or update_blocked_averages() paths. Suggested-by: Vincent Guittot Signed-off-by: Steve Muckle Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Michael Turquette Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1458606068-7476-1-git-send-email-smuckle@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 50 +++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e371f43fc804..6df80d47a5250 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2878,7 +2878,9 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { struct sched_avg *sa = &cfs_rq->avg; + struct rq *rq = rq_of(cfs_rq); int decayed, removed = 0; + int cpu = cpu_of(rq); if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); @@ -2893,7 +2895,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); } - decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, + decayed = __update_load_avg(now, cpu, sa, scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq); #ifndef CONFIG_64BIT @@ -2901,28 +2903,6 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - return decayed || removed; -} - -/* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) -{ - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq); - struct rq *rq = rq_of(cfs_rq); - int cpu = cpu_of(rq); - - /* - * Track task load average for carrying it to new CPU after migrated, and - * track group sched_entity load average for task_h_load calc in migration - */ - __update_load_avg(now, cpu, &se->avg, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - - if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) - update_tg_load_avg(cfs_rq, 0); - if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { unsigned long max = rq->cpu_capacity_orig; @@ -2943,8 +2923,30 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) * See cpu_util(). 
*/ cpufreq_update_util(rq_clock(rq), - min(cfs_rq->avg.util_avg, max), max); + min(sa->util_avg, max), max); } + + return decayed || removed; +} + +/* Update task and its cfs_rq load average */ +static inline void update_load_avg(struct sched_entity *se, int update_tg) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); + + /* + * Track task load average for carrying it to new CPU after migrated, and + * track group sched_entity load average for task_h_load calc in migration + */ + __update_load_avg(now, cpu, &se->avg, + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); + + if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + update_tg_load_avg(cfs_rq, 0); } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -- GitLab From 41e0d37f7ac81297c07ba311e4ad39465b8c8295 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Mon, 21 Mar 2016 17:21:08 -0700 Subject: [PATCH 330/705] sched/fair: Do not call cpufreq hook unless util changed There's no reason to call the cpufreq hook if the root cfs_rq utilization has not been modified. Signed-off-by: Steve Muckle Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Michael Turquette Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/1458606068-7476-2-git-send-email-smuckle@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6df80d47a5250..81552819444c3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2879,20 +2879,21 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { struct sched_avg *sa = &cfs_rq->avg; struct rq *rq = rq_of(cfs_rq); - int decayed, removed = 0; + int decayed, removed_load = 0, removed_util = 0; int cpu = cpu_of(rq); if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sa->load_avg = max_t(long, sa->load_avg - r, 0); sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); - removed = 1; + removed_load = 1; } if (atomic_long_read(&cfs_rq->removed_util_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); sa->util_avg = max_t(long, sa->util_avg - r, 0); sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); + removed_util = 1; } decayed = __update_load_avg(now, cpu, sa, @@ -2903,7 +2904,8 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { + if (cpu == smp_processor_id() && &rq->cfs == cfs_rq && + (decayed || removed_util)) { unsigned long max = rq->cpu_capacity_orig; /* @@ -2926,7 +2928,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) min(sa->util_avg, max), max); } - return decayed || removed; + return decayed || removed_load; } /* Update task and its cfs_rq load average */ -- GitLab From a2c6c91f98247fef0fe75216d607812485aeb0df Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Thu, 24 Mar 2016 15:26:07 -0700 Subject: [PATCH 331/705] sched/fair: Call cpufreq hook in additional paths The cpufreq hook should be called any time the root CFS rq utilization changes. 
This can occur when a task is switched to or from the fair class, or a task moves between groups or CPUs, but these paths currently do not call the cpufreq hook. Fix this by adding the hook to attach_entity_load_avg() and detach_entity_load_avg(). Suggested-by: Vincent Guittot Signed-off-by: Steve Muckle [ Added the .update_freq argument to update_cfs_rq_load_avg() to avoid a double cpufreq call. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Michael Turquette Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1458858367-2831-1-git-send-email-smuckle@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 73 ++++++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 81552819444c3..c328bd77fe355 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2874,13 +2874,41 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); + + if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { + unsigned long max = rq->cpu_capacity_orig; + + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). + */ + cpufreq_update_util(rq_clock(rq), + min(cfs_rq->avg.util_avg, max), max); + } +} + /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ -static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { struct sched_avg *sa = &cfs_rq->avg; - struct rq *rq = rq_of(cfs_rq); int decayed, removed_load = 0, removed_util = 0; - int cpu = cpu_of(rq); if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); @@ -2896,7 +2924,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) removed_util = 1; } - decayed = __update_load_avg(now, cpu, sa, + decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq); #ifndef CONFIG_64BIT @@ -2904,29 +2932,8 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - if (cpu == smp_processor_id() && &rq->cfs == cfs_rq && - (decayed || removed_util)) { - unsigned long max = rq->cpu_capacity_orig; - - /* - * There are a few boundary cases this might miss but it should - * get called often enough that that should (hopefully) not be - * a real problem -- added to that it only calls on the local - * CPU, so if we enqueue remotely we'll miss an update, but - * the next tick/schedule should update. 
- * - * It will not get called when we go idle, because the idle - * thread is a different class (!fair), nor will the utilization - * number include things like RT tasks. - * - * As is, the util number is not freq-invariant (we'd have to - * implement arch_scale_freq_capacity() for that). - * - * See cpu_util(). - */ - cpufreq_update_util(rq_clock(rq), - min(sa->util_avg, max), max); - } + if (update_freq && (decayed || removed_util)) + cfs_rq_util_change(cfs_rq); return decayed || removed_load; } @@ -2947,7 +2954,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) update_tg_load_avg(cfs_rq, 0); } @@ -2976,6 +2983,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + + cfs_rq_util_change(cfs_rq); } static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -2988,6 +2997,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); + + cfs_rq_util_change(cfs_rq); } /* Add the load generated by se into cfs_rq's load average */ @@ -3005,7 +3016,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->curr == se, NULL); } - decayed = update_cfs_rq_load_avg(now, cfs_rq); + decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; @@ -6213,7 +6224,7 @@ static void update_blocked_averages(int cpu) if (throttled_hierarchy(cfs_rq)) continue; - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -6274,7 +6285,7 @@ static inline void update_blocked_averages(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); raw_spin_unlock_irqrestore(&rq->lock, flags); } -- GitLab From cee1afce3053e7aa0793fbd5f2e845fa2cef9e33 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 13 Apr 2016 15:56:50 +0200 Subject: [PATCH 332/705] sched/fair: Gather CPU load functions under a more conventional namespace The CPU load update related functions have a weak naming convention currently, starting with update_cpu_load_*() which isn't ideal as "update" is a very generic concept. Since two of these functions are public already (and a third is to come) that's enough to introduce a more conventional naming scheme. So let's do the following rename instead: update_cpu_load_*() -> cpu_load_update_*() Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Chris Metcalf Cc: Christoph Lameter Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E . 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1460555812-25375-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- Documentation/trace/ftrace.txt | 10 +++++----- include/linux/sched.h | 4 ++-- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 24 ++++++++++++------------ kernel/sched/sched.h | 4 ++-- kernel/time/tick-sched.c | 2 +- 6 files changed, 23 insertions(+), 23 deletions(-) diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index f52f297cb4062..9857606dd7b71 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -1562,12 +1562,12 @@ Doing the same with chrt -r 5 and function-trace set. -0 3dN.1 12us : menu_hrtimer_cancel <-tick_nohz_idle_exit -0 3dN.1 12us : ktime_get <-tick_nohz_idle_exit -0 3dN.1 12us : tick_do_update_jiffies64 <-tick_nohz_idle_exit - -0 3dN.1 13us : update_cpu_load_nohz <-tick_nohz_idle_exit - -0 3dN.1 13us : _raw_spin_lock <-update_cpu_load_nohz + -0 3dN.1 13us : cpu_load_update_nohz <-tick_nohz_idle_exit + -0 3dN.1 13us : _raw_spin_lock <-cpu_load_update_nohz -0 3dN.1 13us : add_preempt_count <-_raw_spin_lock - -0 3dN.2 13us : __update_cpu_load <-update_cpu_load_nohz - -0 3dN.2 14us : sched_avg_update <-__update_cpu_load - -0 3dN.2 14us : _raw_spin_unlock <-update_cpu_load_nohz + -0 3dN.2 13us : __cpu_load_update <-cpu_load_update_nohz + -0 3dN.2 14us : sched_avg_update <-__cpu_load_update + -0 3dN.2 14us : _raw_spin_unlock <-cpu_load_update_nohz -0 3dN.2 14us : sub_preempt_count <-_raw_spin_unlock -0 3dN.1 15us : calc_load_exit_idle <-tick_nohz_idle_exit -0 3dN.1 15us : touch_softlockup_watchdog <-tick_nohz_idle_exit diff --git a/include/linux/sched.h b/include/linux/sched.h index 13c1c1d07270a..0b7f6028a50b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -178,9 +178,9 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); extern void calc_global_load(unsigned long ticks); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) -extern void update_cpu_load_nohz(int active); +extern void cpu_load_update_nohz(int active); #else -static inline void update_cpu_load_nohz(int active) { } +static inline void cpu_load_update_nohz(int active) { } #endif extern void dump_cpu_task(int cpu); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 06efbb9c95441..c98a2688f3909 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2917,7 +2917,7 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); - update_cpu_load_active(rq); + cpu_load_update_active(rq); calc_global_load_tick(rq); raw_spin_unlock(&rq->lock); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c328bd77fe355..ecd81c4ebb56a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4559,7 +4559,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) } /** - * __update_cpu_load - update the rq->cpu_load[] statistics + * __cpu_load_update - update the rq->cpu_load[] statistics * @this_rq: The rq to update statistics for * @this_load: The current load * @pending_updates: The number of missed updates @@ -4594,7 +4594,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra * term. See the @active paramter. 
*/ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, +static void __cpu_load_update(struct rq *this_rq, unsigned long this_load, unsigned long pending_updates, int active) { unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0; @@ -4642,7 +4642,7 @@ static unsigned long weighted_cpuload(const int cpu) } #ifdef CONFIG_NO_HZ_COMMON -static void __update_cpu_load_nohz(struct rq *this_rq, +static void __cpu_load_update_nohz(struct rq *this_rq, unsigned long curr_jiffies, unsigned long load, int active) @@ -4657,7 +4657,7 @@ static void __update_cpu_load_nohz(struct rq *this_rq, * In the NOHZ_FULL case, we were non-idle, we should consider * its weighted load. */ - __update_cpu_load(this_rq, load, pending_updates, active); + __cpu_load_update(this_rq, load, pending_updates, active); } } @@ -4678,7 +4678,7 @@ static void __update_cpu_load_nohz(struct rq *this_rq, * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ -static void update_cpu_load_idle(struct rq *this_rq) +static void cpu_load_update_idle(struct rq *this_rq) { /* * bail if there's load or we're actually up-to-date. @@ -4686,13 +4686,13 @@ static void update_cpu_load_idle(struct rq *this_rq) if (weighted_cpuload(cpu_of(this_rq))) return; - __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0); + __cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0, 0); } /* * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. */ -void update_cpu_load_nohz(int active) +void cpu_load_update_nohz(int active) { struct rq *this_rq = this_rq(); unsigned long curr_jiffies = READ_ONCE(jiffies); @@ -4702,7 +4702,7 @@ void update_cpu_load_nohz(int active) return; raw_spin_lock(&this_rq->lock); - __update_cpu_load_nohz(this_rq, curr_jiffies, load, active); + __cpu_load_update_nohz(this_rq, curr_jiffies, load, active); raw_spin_unlock(&this_rq->lock); } #endif /* CONFIG_NO_HZ */ @@ -4710,14 +4710,14 @@ void update_cpu_load_nohz(int active) /* * Called from scheduler_tick() */ -void update_cpu_load_active(struct rq *this_rq) +void cpu_load_update_active(struct rq *this_rq) { unsigned long load = weighted_cpuload(cpu_of(this_rq)); /* - * See the mess around update_cpu_load_idle() / update_cpu_load_nohz(). + * See the mess around cpu_load_update_idle() / cpu_load_update_nohz(). 
*/ this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1, 1); + __cpu_load_update(this_rq, load, 1, 1); } /* @@ -8031,7 +8031,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (time_after_eq(jiffies, rq->next_balance)) { raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); - update_cpu_load_idle(rq); + cpu_load_update_idle(rq); raw_spin_unlock_irq(&rq->lock); rebalance_domains(rq, CPU_IDLE); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a7cbad7b3ad28..32d9e22cfacfc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -31,9 +31,9 @@ extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq); #ifdef CONFIG_SMP -extern void update_cpu_load_active(struct rq *this_rq); +extern void cpu_load_update_active(struct rq *this_rq); #else -static inline void update_cpu_load_active(struct rq *this_rq) { } +static inline void cpu_load_update_active(struct rq *this_rq) { } #endif /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 58e3310c9b213..66bdc9acc2831 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -806,7 +806,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int { /* Update jiffies first */ tick_do_update_jiffies64(now); - update_cpu_load_nohz(active); + cpu_load_update_nohz(active); calc_load_exit_idle(); touch_softlockup_watchdog_sched(); -- GitLab From 1f41906a6fda1114debd3898668bd7ab6470ee41 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 13 Apr 2016 15:56:51 +0200 Subject: [PATCH 333/705] sched/fair: Correctly handle nohz ticks CPU load accounting Ticks can happen while the CPU is in dynticks-idle or dynticks-singletask mode. In fact "nohz" or "dynticks" only mean that we exit the periodic mode and we try to minimize the ticks as much as possible. The nohz subsystem uses a confusing terminology with the internal state "ts->tick_stopped" which is also available through its public interface with tick_nohz_tick_stopped(). This is a misnomer as the tick is instead reduced with the best effort rather than stopped. In the best case the tick can indeed be actually stopped but there is no guarantee about that. If a timer needs to fire one second later, a tick will fire while the CPU is in nohz mode and this is a very common scenario. Now this confusion happens to be a problem with CPU load updates: cpu_load_update_active() doesn't handle nohz ticks correctly because it assumes that ticks are completely stopped in nohz mode and that cpu_load_update_active() can't be called in dynticks mode. When that happens, the whole previous tickless load is ignored and the function just records the load for the current tick, ignoring potentially long idle periods behind. In order to solve this, we could account the current load for the previous nohz time but there is a risk that we account the load of a task that got freshly enqueued for the whole nohz period. So instead, lets record the dynticks load on nohz frame entry so we know what to record in case of nohz ticks, then use this record to account the tickless load on nohz ticks and nohz frame end. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Chris Metcalf Cc: Christoph Lameter Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E . 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1460555812-25375-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ++- kernel/sched/fair.c | 97 ++++++++++++++++++++++++++-------------- kernel/time/tick-sched.c | 9 ++-- 3 files changed, 72 insertions(+), 40 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 0b7f6028a50b8..d894f2d61388a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -178,9 +178,11 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); extern void calc_global_load(unsigned long ticks); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) -extern void cpu_load_update_nohz(int active); +extern void cpu_load_update_nohz_start(void); +extern void cpu_load_update_nohz_stop(void); #else -static inline void cpu_load_update_nohz(int active) { } +static inline void cpu_load_update_nohz_start(void) { } +static inline void cpu_load_update_nohz_stop(void) { } #endif extern void dump_cpu_task(int cpu); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ecd81c4ebb56a..b70367a3e1efd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4563,7 +4563,6 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) * @this_rq: The rq to update statistics for * @this_load: The current load * @pending_updates: The number of missed updates - * @active: !0 for NOHZ_FULL * * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). @@ -4592,12 +4591,12 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) * load[i]_n = (1 - 1/2^i)^n * load[i]_0 * * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra - * term. See the @active paramter. + * term. */ -static void __cpu_load_update(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates, int active) +static void cpu_load_update(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) { - unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0; + unsigned long tickless_load = this_rq->cpu_load[0]; int i, scale; this_rq->nr_load_updates++; @@ -4642,10 +4641,23 @@ static unsigned long weighted_cpuload(const int cpu) } #ifdef CONFIG_NO_HZ_COMMON -static void __cpu_load_update_nohz(struct rq *this_rq, - unsigned long curr_jiffies, - unsigned long load, - int active) +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we need to avoid the delta approach from the regular tick when + * possible since that would seriously skew the load calculation. This is why we + * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on + * jiffies deltas for updates happening while in nohz mode (idle ticks, idle + * loop exit, nohz_idle_balance, nohz full exit...) + * + * This means we might still be one tick off for nohz periods. + */ + +static void cpu_load_update_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load) { unsigned long pending_updates; @@ -4657,23 +4669,10 @@ static void __cpu_load_update_nohz(struct rq *this_rq, * In the NOHZ_FULL case, we were non-idle, we should consider * its weighted load. 
*/ - __cpu_load_update(this_rq, load, pending_updates, active); + cpu_load_update(this_rq, load, pending_updates); } } -/* - * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we cannot use the delta approach from the regular tick since that - * would seriously skew the load calculation. However we'll make do for those - * updates happening while idle (nohz_idle_balance) or coming out of idle - * (tick_nohz_idle_exit). - * - * This means we might still be one tick off for nohz periods. - */ - /* * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. @@ -4686,26 +4685,56 @@ static void cpu_load_update_idle(struct rq *this_rq) if (weighted_cpuload(cpu_of(this_rq))) return; - __cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0, 0); + cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); } /* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + * Record CPU load on nohz entry so we know the tickless load to account + * on nohz exit. cpu_load[0] happens then to be updated more frequently + * than other cpu_load[idx] but it should be fine as cpu_load readers + * shouldn't rely into synchronized cpu_load[*] updates. */ -void cpu_load_update_nohz(int active) +void cpu_load_update_nohz_start(void) { struct rq *this_rq = this_rq(); + + /* + * This is all lockless but should be fine. If weighted_cpuload changes + * concurrently we'll exit nohz. And cpu_load write can race with + * cpu_load_update_idle() but both updater would be writing the same. + */ + this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq)); +} + +/* + * Account the tickless load in the end of a nohz frame. + */ +void cpu_load_update_nohz_stop(void) +{ unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; + struct rq *this_rq = this_rq(); + unsigned long load; if (curr_jiffies == this_rq->last_load_update_tick) return; + load = weighted_cpuload(cpu_of(this_rq)); raw_spin_lock(&this_rq->lock); - __cpu_load_update_nohz(this_rq, curr_jiffies, load, active); + cpu_load_update_nohz(this_rq, curr_jiffies, load); raw_spin_unlock(&this_rq->lock); } -#endif /* CONFIG_NO_HZ */ +#else /* !CONFIG_NO_HZ_COMMON */ +static inline void cpu_load_update_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load) { } +#endif /* CONFIG_NO_HZ_COMMON */ + +static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) +{ + /* See the mess around cpu_load_update_nohz(). */ + this_rq->last_load_update_tick = READ_ONCE(jiffies); + cpu_load_update(this_rq, load, 1); +} /* * Called from scheduler_tick() @@ -4713,11 +4742,11 @@ void cpu_load_update_nohz(int active) void cpu_load_update_active(struct rq *this_rq) { unsigned long load = weighted_cpuload(cpu_of(this_rq)); - /* - * See the mess around cpu_load_update_idle() / cpu_load_update_nohz(). 
- */
-	this_rq->last_load_update_tick = jiffies;
-	__cpu_load_update(this_rq, load, 1, 1);
+
+	if (tick_nohz_tick_stopped())
+		cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
+	else
+		cpu_load_update_periodic(this_rq, load);
 }

 /*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 66bdc9acc2831..31872bc53bc45 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -776,6 +776,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	if (!ts->tick_stopped) {
 		nohz_balance_enter_idle(cpu);
 		calc_load_enter_idle();
+		cpu_load_update_nohz_start();

 		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
 		ts->tick_stopped = 1;
@@ -802,11 +803,11 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	return tick;
 }

-static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active)
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 {
 	/* Update jiffies first */
 	tick_do_update_jiffies64(now);
-	cpu_load_update_nohz(active);
+	cpu_load_update_nohz_stop();

 	calc_load_exit_idle();
 	touch_softlockup_watchdog_sched();
@@ -833,7 +834,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
 	if (can_stop_full_tick(ts))
 		tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
 	else if (ts->tick_stopped)
-		tick_nohz_restart_sched_tick(ts, ktime_get(), 1);
+		tick_nohz_restart_sched_tick(ts, ktime_get());
 #endif
 }

@@ -1024,7 +1025,7 @@ void tick_nohz_idle_exit(void)
 	tick_nohz_stop_idle(ts, now);

 	if (ts->tick_stopped) {
-		tick_nohz_restart_sched_tick(ts, now, 0);
+		tick_nohz_restart_sched_tick(ts, now);
 		tick_nohz_account_idle_ticks(ts);
 	}

-- GitLab
From 9fd81dd5ce0b12341c9f83346f8d32ac68bd3841 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Tue, 19 Apr 2016 17:36:51 +0200
Subject: [PATCH 334/705] sched/fair: Optimize !CONFIG_NO_HZ_COMMON CPU load updates

Some code in the CPU load update only concerns NO_HZ configs but it is
built on all configurations. When NO_HZ isn't built, that code is
harmless but just happens to take up some useless resources in CPU and
memory:

1) one useless field in struct rq

2) a jiffies record on every tick that is never used
   (cpu_load_update_periodic)

3) decay_load_missed() is called twice on every tick only to
   return immediately with no action taken. And that function is
   dead code.

For pure optimization purposes, let's conditionally build the NO_HZ
related code.

Signed-off-by: Frederic Weisbecker
Signed-off-by: Peter Zijlstra (Intel)
Cc: Byungchul Park
Cc: Chris Metcalf
Cc: Christoph Lameter
Cc: Luiz Capitulino
Cc: Mike Galbraith
Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1461080211-16271-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 5 ++--- kernel/sched/fair.c | 9 +++++++-- kernel/sched/sched.h | 6 ++++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c98a2688f3909..71dffbb27ce61 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7381,8 +7381,6 @@ void __init sched_init(void) for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; - rq->last_load_update_tick = jiffies; - #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; @@ -7401,12 +7399,13 @@ void __init sched_init(void) rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON + rq->last_load_update_tick = jiffies; rq->nohz_flags = 0; #endif #ifdef CONFIG_NO_HZ_FULL rq->last_sched_tick = 0; #endif -#endif +#endif /* CONFIG_SMP */ init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b70367a3e1efd..b8a33abce6505 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4491,7 +4491,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP - +#ifdef CONFIG_NO_HZ_COMMON /* * per rq 'load' arrray crap; XXX kill this. */ @@ -4557,6 +4557,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) } return load; } +#endif /* CONFIG_NO_HZ_COMMON */ /** * __cpu_load_update - update the rq->cpu_load[] statistics @@ -4596,7 +4597,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) static void cpu_load_update(struct rq *this_rq, unsigned long this_load, unsigned long pending_updates) { - unsigned long tickless_load = this_rq->cpu_load[0]; + unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0]; int i, scale; this_rq->nr_load_updates++; @@ -4609,6 +4610,7 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load, /* scale is effectively 1 << i now, and >> i divides by scale */ old_load = this_rq->cpu_load[i]; +#ifdef CONFIG_NO_HZ_COMMON old_load = decay_load_missed(old_load, pending_updates - 1, i); if (tickless_load) { old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); @@ -4619,6 +4621,7 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load, */ old_load += tickless_load; } +#endif new_load = this_load; /* * Round up the averaging division if load is increasing. This @@ -4731,8 +4734,10 @@ static inline void cpu_load_update_nohz(struct rq *this_rq, static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) { +#ifdef CONFIG_NO_HZ_COMMON /* See the mess around cpu_load_update_nohz(). 
 */
 	this_rq->last_load_update_tick = READ_ONCE(jiffies);
+#endif
 	cpu_load_update(this_rq, load, 1);
 }

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 32d9e22cfacfc..69da6fcaa0e87 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -585,11 +585,13 @@ struct rq {
 #endif
 #define CPU_LOAD_IDX_MAX 5
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
-	unsigned long last_load_update_tick;
 #ifdef CONFIG_NO_HZ_COMMON
+#ifdef CONFIG_SMP
+	unsigned long last_load_update_tick;
+#endif /* CONFIG_SMP */
 	u64 nohz_stamp;
 	unsigned long nohz_flags;
-#endif
+#endif /* CONFIG_NO_HZ_COMMON */
 #ifdef CONFIG_NO_HZ_FULL
 	unsigned long last_sched_tick;
 #endif

-- GitLab
From fec148c000d0f9ac21679601722811eb60b4cc52 Mon Sep 17 00:00:00 2001
From: Xunlei Pang
Date: Thu, 14 Apr 2016 20:19:28 +0800
Subject: [PATCH 335/705] sched/deadline: Fix a bug in dl_overflow()

I got a negative (very big) dl_b->total_bw during my deadline tests:

    # grep dl /proc/sched_debug
    dl_rq[0]:
      .dl_nr_running   : 0
      .dl_bw->bw       : 996147
      .dl_bw->total_bw : -222297900

Something unusual must have happened. After some digging, I finally
noticed that when changing a deadline task to normal (cfs), and
changing it back to deadline immediately, after it died we will get
the wrong dl_bw->total_bw.

The root cause is in dl_overflow(), which has:

    if (new_bw == p->dl.dl_bw)
        return 0;

1) When a deadline task is changed to a !deadline task, it will start
   the dl timer in switched_from_dl(), and retain the previous deadline
   parameters till the timer expires.

2) If we change it back to deadline with the same bandwidth parameter
   before the timer expires, then because the task still carries the old
   bandwidth even though it is not a deadline task, dl_overflow() simply
   returns success without updating the accounting, and we get the wrong
   dl_bw->total_bw.

The solution is simple: if @p is not deadline, don't return.

Signed-off-by: Xunlei Pang
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Juri Lelli
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Steven Rostedt
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1460636368-1993-1-git-send-email-xlpang@redhat.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 71dffbb27ce61..9d84d6004745c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2378,7 +2378,8 @@ static int dl_overflow(struct task_struct *p, int policy,
 	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
 	int cpus, err = -1;

-	if (new_bw == p->dl.dl_bw)
+	/* !deadline task may carry old deadline bandwidth */
+	if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
 		return 0;

 	/*

-- GitLab
From 70a2cba972e5e4a5d850e4179381f1cd344c6828 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin
Date: Tue, 19 Apr 2016 11:17:27 +0300
Subject: [PATCH 336/705] perf buildid: Fix off-by-one in write_buildid()

write_buildid() increments 'name_len' with the intention of taking the
trailing zero byte into account. However, 'name_len' was already
incremented in machine__write_buildid_table() before.
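Schematically, the double accounting looks like this (a reduced sketch;
only the two increments are shown, with names taken from the code below):

	/* machine__write_buildid_table(), before this fix: */
	name_len = pos->long_name_len + 1;	/* +1 meant to cover the NUL... */

	/* ...but write_buildid() already adds its own: */
	len = name_len + 1;			/* second +1 reads one byte past the string */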
So this leads to an out-of-bounds read in do_write():

  $ ./perf record sleep 0
  [ perf record: Woken up 1 times to write data ]
  =================================================================
  ==15899==ERROR: AddressSanitizer: global-buffer-overflow on address 0x00000099fc92
  at pc 0x7f1aa9c7eab5 bp 0x7fff940f84d0 sp 0x7fff940f7c78
  READ of size 19 at 0x00000099fc92 thread T0
    #0 0x7f1aa9c7eab4 (/usr/lib/gcc/x86_64-pc-linux-gnu/5.3.0/libasan.so.2+0x44ab4)
    #1 0x649c5b in do_write util/header.c:67
    #2 0x649c5b in write_padded util/header.c:82
    #3 0x57e8bc in write_buildid util/build-id.c:239
    #4 0x57e8bc in machine__write_buildid_table util/build-id.c:278
  ...
  0x00000099fc92 is located 0 bytes to the right of global variable '*.LC99'
  defined in 'util/symbol.c' (0x99fc80) of size 18
  '*.LC99' is ascii string '[kernel.kallsyms]'
  ...
  Shadow bytes around the buggy address:
    0x00008012bf80: f9 f9 f9 f9 00 00 00 00 00 00 03 f9 f9 f9 f9 f9
  =>0x00008012bf90: 00 00[02]f9 f9 f9 f9 f9 00 00 00 00 00 05 f9 f9
    0x00008012bfa0: f9 f9 f9 f9 00 03 f9 f9 f9 f9 f9 f9 00 00 00 00

Signed-off-by: Andrey Ryabinin
Cc: Alexander Shishkin
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/1461053847-5633-1-git-send-email-aryabinin@virtuozzo.com
[ Remove the off-by-one at the origin, to keep the len(s) == strlen(s) assumption ]
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/util/build-id.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index 0573c2ec861d9..b6ecf87bc3e3c 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -261,14 +261,14 @@ static int machine__write_buildid_table(struct machine *machine, int fd)

 		if (dso__is_vdso(pos)) {
 			name = pos->short_name;
-			name_len = pos->short_name_len + 1;
+			name_len = pos->short_name_len;
 		} else if (dso__is_kcore(pos)) {
 			machine__mmap_name(machine, nm, sizeof(nm));
 			name = nm;
-			name_len = strlen(nm) + 1;
+			name_len = strlen(nm);
 		} else {
 			name = pos->long_name;
-			name_len = pos->long_name_len + 1;
+			name_len = pos->long_name_len;
 		}

 		in_kernel = pos->kernel ||

-- GitLab
From 0ae537cb35e63f6a61013e736a0557b83a0336ea Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Tue, 19 Apr 2016 16:00:01 -0300
Subject: [PATCH 337/705] perf trace: Extract evsel constructor from perf_evlist__add_pgfault

Prep work for the next patches, where we'll need access to the created
evsels, to possibly configure callchains.
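The shape of the change, reduced to its essentials (a sketch reusing the
names from the diff below; the error labels are illustrative):

	/* Before: creation and registration fused, no handle kept. */
	if (perf_evlist__add_pgfault(evlist, config))
		goto out_error_mem;

	/* After: the constructor hands back the evsel, so the caller can
	 * hold on to it (e.g. to configure callchains) before adding it. */
	evsel = perf_evsel__new_pgfault(config);
	if (evsel == NULL)
		goto out_error_mem;
	perf_evlist__add(evlist, evsel);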
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-2pcgsgnkgellhlcao4aub8tu@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 5e2614bbb48da..69b4603542013 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2381,8 +2381,7 @@ static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist) return true; } -static int perf_evlist__add_pgfault(struct perf_evlist *evlist, - u64 config) +static struct perf_evsel *perf_evsel__new_pgfault(u64 config) { struct perf_evsel *evsel; struct perf_event_attr attr = { @@ -2396,13 +2395,10 @@ static int perf_evlist__add_pgfault(struct perf_evlist *evlist, event_attr_init(&attr); evsel = perf_evsel__new(&attr); - if (!evsel) - return -ENOMEM; - - evsel->handler = trace__pgfault; - perf_evlist__add(evlist, evsel); + if (evsel) + evsel->handler = trace__pgfault; - return 0; + return evsel; } static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample) @@ -2504,7 +2500,7 @@ static int trace__set_ev_qualifier_filter(struct trace *trace) static int trace__run(struct trace *trace, int argc, const char **argv) { struct perf_evlist *evlist = trace->evlist; - struct perf_evsel *evsel; + struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL; int err = -1, i; unsigned long before; const bool forks = argc > 0; @@ -2518,14 +2514,19 @@ static int trace__run(struct trace *trace, int argc, const char **argv) if (trace->trace_syscalls) trace->vfs_getname = perf_evlist__add_vfs_getname(evlist); - if ((trace->trace_pgfaults & TRACE_PFMAJ) && - perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) { - goto out_error_mem; + if ((trace->trace_pgfaults & TRACE_PFMAJ)) { + pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ); + if (pgfault_maj == NULL) + goto out_error_mem; + perf_evlist__add(evlist, pgfault_maj); } - if ((trace->trace_pgfaults & TRACE_PFMIN) && - perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN)) - goto out_error_mem; + if ((trace->trace_pgfaults & TRACE_PFMIN)) { + pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN); + if (pgfault_min == NULL) + goto out_error_mem; + perf_evlist__add(evlist, pgfault_min); + } if (trace->sched && perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime", -- GitLab From 0c3a6ef4ea54a179328734a45b7f7698e44ad805 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 19 Apr 2016 16:31:12 -0300 Subject: [PATCH 338/705] perf trace: Make --pf maj/min/all use callchains too Forgot about page faults, a software event, when adding support for callchains, fix it: # trace --no-syscalls --pf maj --call dwarf 0.000 ( 0.000 ms): Xorg/2068 majfault [sfbSegment1+0x0] => /usr/lib64/xorg/modules/drivers/intel_drv.so@0x11b490 (x.) 
sfbSegment1+0x0 (/usr/lib64/xorg/modules/drivers/intel_drv.so) fbPolySegment32+0x361 (/usr/lib64/xorg/modules/drivers/intel_drv.so) sna_poly_segment+0x743 (/usr/lib64/xorg/modules/drivers/intel_drv.so) damagePolySegment+0x77 (/usr/libexec/Xorg) ProcPolySegment+0xe7 (/usr/libexec/Xorg) Dispatch+0x25f (/usr/libexec/Xorg) dix_main+0x3c3 (/usr/libexec/Xorg) __libc_start_main+0xf0 (/usr/lib64/libc-2.22.so) _start+0x29 (/usr/libexec/Xorg) 0.257 ( 0.000 ms): Xorg/2068 majfault [miZeroClipLine+0x0] => /usr/libexec/Xorg@0x18e830 (x.) miZeroClipLine+0x0 (/usr/libexec/Xorg) _fbSegment+0x2c0 (/usr/lib64/xorg/modules/drivers/intel_drv.so) sfbSegment1+0x67 (/usr/lib64/xorg/modules/drivers/intel_drv.so) fbPolySegment32+0x361 (/usr/lib64/xorg/modules/drivers/intel_drv.so) sna_poly_segment+0x743 (/usr/lib64/xorg/modules/drivers/intel_drv.so) damagePolySegment+0x77 (/usr/libexec/Xorg) ProcPolySegment+0xe7 (/usr/libexec/Xorg) Dispatch+0x25f (/usr/libexec/Xorg) dix_main+0x3c3 (/usr/libexec/Xorg) __libc_start_main+0xf0 (/usr/lib64/libc-2.22.so) _start+0x29 (/usr/libexec/Xorg) ^C# Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-8h6ssirw5z15qyhy2lwd6f89@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 59 ++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 69b4603542013..d1bbcb9abca3e 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2222,6 +2222,11 @@ static int trace__pgfault(struct trace *trace, print_location(trace->output, sample, &al, true, false); fprintf(trace->output, " (%c%c)\n", map_type, al.level); + + if (sample->callchain) { + if (trace__resolve_callchain(trace, evsel, sample, &callchain_cursor) == 0) + trace__fprintf_callchain(trace, sample); + } out: err = 0; out_put: @@ -2547,24 +2552,42 @@ static int trace__run(struct trace *trace, int argc, const char **argv) perf_evlist__config(evlist, &trace->opts, NULL); - if (callchain_param.enabled && trace->syscalls.events.sys_exit) { - perf_evsel__config_callchain(trace->syscalls.events.sys_exit, - &trace->opts, &callchain_param); - /* - * Now we have evsels with different sample_ids, use - * PERF_SAMPLE_IDENTIFIER to map from sample to evsel - * from a fixed position in each ring buffer record. - * - * As of this the changeset introducing this comment, this - * isn't strictly needed, as the fields that can come before - * PERF_SAMPLE_ID are all used, but we'll probably disable - * some of those for things like copying the payload of - * pointer syscall arguments, and for vfs_getname we don't - * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this - * here as a warning we need to use PERF_SAMPLE_IDENTIFIER. 
- */
-	perf_evlist__set_sample_bit(evlist, IDENTIFIER);
-	perf_evlist__reset_sample_bit(evlist, ID);
+	if (callchain_param.enabled) {
+		bool use_identifier = false;
+
+		if (trace->syscalls.events.sys_exit) {
+			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
+						     &trace->opts, &callchain_param);
+			use_identifier = true;
+		}
+
+		if (pgfault_maj) {
+			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
+			use_identifier = true;
+		}
+
+		if (pgfault_min) {
+			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
+			use_identifier = true;
+		}
+
+		if (use_identifier) {
+			/*
+			 * Now we have evsels with different sample_ids, use
+			 * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
+			 * from a fixed position in each ring buffer record.
+			 *
+			 * As of this the changeset introducing this comment, this
+			 * isn't strictly needed, as the fields that can come before
+			 * PERF_SAMPLE_ID are all used, but we'll probably disable
+			 * some of those for things like copying the payload of
+			 * pointer syscall arguments, and for vfs_getname we don't
+			 * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
+			 * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
+			 */
+			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
+			perf_evlist__reset_sample_bit(evlist, ID);
+		}
 	}

 	signal(SIGCHLD, sig_handler);

-- GitLab
From e557b674a9470dae99916be6105e6780b3a072ca Mon Sep 17 00:00:00 2001
From: Chris Phlipot
Date: Tue, 19 Apr 2016 19:32:11 -0700
Subject: [PATCH 339/705] perf script: Fix segfault when printing callchains

This fixes a bug caused by an uninitialized callchain cursor. The
crash first appeared in:

  6f736735e30f ("perf evsel: Require that callchains be resolved before calling fprintf_{sym,callchain}")

The callchain cursor is a struct that contains pointers which, when
uninitialized, will cause unpredictable behavior (usually a crash)
when trying to append to the callchain. The existing implementation
has the following issues:

1. The callchain cursor used is not initialized, resulting in
   unpredictable behavior when used.

2. The cursor is declared on the stack. Even if it is properly
   initialized, the implementation will leak memory when the function
   returns, since all the references to the callchain_nodes allocated
   by callchain_cursor_append will be lost when the cursor goes out of
   scope.

3. Storing the cursor on the stack is inefficient. Even if memory is
   properly freed when it goes out of scope, a performance penalty
   will be incurred due to reallocation of callchain nodes.
   callchain_cursor_append is designed to avoid these reallocations
   when an existing cursor is reused.

This patch fixes the crash by replacing cursor_callchain with a
reference to the global callchain_cursor, which also resolves all 3
issues mentioned above.
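In generic terms, the bug class looks like this (an illustrative sketch,
not the perf code itself; all names here are hypothetical):

	struct cursor {
		struct node *first, *curr;	/* pointers into a node list */
		unsigned long nr;
	};

	void print_chain(void)
	{
		struct cursor c;	/* never initialized: first/curr are garbage */

		append(&c, entry);	/* chases garbage pointers: crash (issue 1);
					 * any nodes it allocates are referenced
					 * only through 'c' */
	}				/* 'c' dies here: the nodes leak (issue 2)
					 * and cannot be reused, forcing fresh
					 * allocations on the next call (issue 3) */

Reusing one long-lived cursor, as the fix does with the global
callchain_cursor, avoids all three problems.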
How to reproduce the crash: $ perf record --call-graph=dwarf stress -t 1 -c 1 $ perf script > /dev/null Segfault Signed-off-by: Chris Phlipot Tested-by: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Fixes: 6f736735e30f ("perf evsel: Require that callchains be resolved before calling fprintf_{sym,callchain}") Link: http://lkml.kernel.org/r/1461119531-2529-1-git-send-email-cphlipot0@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 5099740aa50bc..f43b0c6f88f45 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -570,12 +570,12 @@ static void print_sample_bts(struct perf_sample *sample, /* print branch_from information */ if (PRINT_FIELD(IP)) { unsigned int print_opts = output[attr->type].print_ip_opts; - struct callchain_cursor *cursor = NULL, cursor_callchain; + struct callchain_cursor *cursor = NULL; if (symbol_conf.use_callchain && sample->callchain && - thread__resolve_callchain(al->thread, &cursor_callchain, evsel, + thread__resolve_callchain(al->thread, &callchain_cursor, evsel, sample, NULL, NULL, scripting_max_stack) == 0) - cursor = &cursor_callchain; + cursor = &callchain_cursor; if (cursor == NULL) { putchar(' '); @@ -789,12 +789,12 @@ static void process_event(struct perf_script *script, printf("%16" PRIu64, sample->weight); if (PRINT_FIELD(IP)) { - struct callchain_cursor *cursor = NULL, cursor_callchain; + struct callchain_cursor *cursor = NULL; if (symbol_conf.use_callchain && sample->callchain && - thread__resolve_callchain(al->thread, &cursor_callchain, evsel, + thread__resolve_callchain(al->thread, &callchain_cursor, evsel, sample, NULL, NULL, scripting_max_stack) == 0) - cursor = &cursor_callchain; + cursor = &callchain_cursor; putchar(cursor ? 
'\n' : ' '); sample__fprintf_sym(sample, al, 0, output[attr->type].print_ip_opts, cursor, stdout); -- GitLab From 7ad356159542e1f0dd4703ff3604f67390657f57 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 20 Apr 2016 19:55:48 -0300 Subject: [PATCH 340/705] perf trace: Make --event honour --min-stack too Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-shj0fazntmskhjild5i6x73l@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index d1bbcb9abca3e..fc276d7181720 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2126,6 +2126,17 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, union perf_event *event __maybe_unused, struct perf_sample *sample) { + int callchain_ret = 0; + + if (sample->callchain) { + callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); + if (callchain_ret == 0) { + if (callchain_cursor.nr < trace->min_stack) + goto out; + callchain_ret = 1; + } + } + trace__printf_interrupted_entry(trace, sample); trace__fprintf_tstamp(trace, sample->time, trace->output); @@ -2144,11 +2155,11 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, fprintf(trace->output, ")\n"); - if (sample->callchain) { - if (trace__resolve_callchain(trace, evsel, sample, &callchain_cursor) == 0) - trace__fprintf_callchain(trace, sample); - } - + if (callchain_ret > 0) + trace__fprintf_callchain(trace, sample); + else if (callchain_ret < 0) + pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); +out: return 0; } -- GitLab From 1df54290463e84b7b5eb26e5e6472167c3749901 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 20 Apr 2016 20:06:02 -0300 Subject: [PATCH 341/705] perf trace: Make --pf honour --min-stack too To check deeply nested page fault callchains. 
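For instance, combining this with the dwarf unwinding shown earlier in
this series (an assumed invocation; the threshold value is arbitrary):

	# trace --no-syscalls --pf maj --min-stack 16 --call dwarf

would print major page fault events only when the resolved callchain is
at least 16 entries deep.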
Cc: Adrian Hunter
Cc: David Ahern
Cc: Jiri Olsa
Cc: Milian Wolff
Cc: Namhyung Kim
Cc: Wang Nan
Link: http://lkml.kernel.org/n/tip-wuji34xx003kr88nmqt6jkgf@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/builtin-trace.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index fc276d7181720..a4b133fac82ba 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -2190,8 +2190,19 @@ static int trace__pgfault(struct trace *trace,
 	char map_type = 'd';
 	struct thread_trace *ttrace;
 	int err = -1;
+	int callchain_ret = 0;

 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
+
+	if (sample->callchain) {
+		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
+		if (callchain_ret == 0) {
+			if (callchain_cursor.nr < trace->min_stack)
+				goto out_put;
+			callchain_ret = 1;
+		}
+	}
+
 	ttrace = thread__trace(thread, trace->output);
 	if (ttrace == NULL)
 		goto out_put;
@@ -2234,10 +2245,10 @@

 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

-	if (sample->callchain) {
-		if (trace__resolve_callchain(trace, evsel, sample, &callchain_cursor) == 0)
-			trace__fprintf_callchain(trace, sample);
-	}
+	if (callchain_ret > 0)
+		trace__fprintf_callchain(trace, sample);
+	else if (callchain_ret < 0)
+		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
 out:
 	err = 0;
 out_put:

-- GitLab
From a213b92e15cc5019156594c8f3ae9170915aac9f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Mon, 25 Apr 2016 16:45:29 -0300
Subject: [PATCH 342/705] perf evlist: Decode perf_event_attr->branch_sample_type

While trying to use --call-graph lbr in 'perf trace', since we are only
interested in the callchain for userspace, I found that 'perf evlist'
is not decoding the branch_sample_type field, fix it.
Before: # perf record --call-graph lbr usleep 1 # perf evlist -v cycles:ppp: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|BRANCH_STACK, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, branch_sample_type: 51201 ^^^^^^^^^^^^^^^^^^^^^^^^^ After: # perf evlist -v cycles:ppp: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|BRANCH_STACK, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, branch_sample_type: USER|CALL_STACK|NO_FLAGS|NO_CYCLES ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-hozai7974u0ulgx13k96fcaw@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 545bb3f0b2b06..334364e25bbe8 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1231,6 +1231,21 @@ static void __p_sample_type(char *buf, size_t size, u64 value) __p_bits(buf, size, value, bits); } +static void __p_branch_sample_type(char *buf, size_t size, u64 value) +{ +#define bit_name(n) { PERF_SAMPLE_BRANCH_##n, #n } + struct bit_names bits[] = { + bit_name(USER), bit_name(KERNEL), bit_name(HV), bit_name(ANY), + bit_name(ANY_CALL), bit_name(ANY_RETURN), bit_name(IND_CALL), + bit_name(ABORT_TX), bit_name(IN_TX), bit_name(NO_TX), + bit_name(COND), bit_name(CALL_STACK), bit_name(IND_JUMP), + bit_name(CALL), bit_name(NO_FLAGS), bit_name(NO_CYCLES), + { .name = NULL, } + }; +#undef bit_name + __p_bits(buf, size, value, bits); +} + static void __p_read_format(char *buf, size_t size, u64 value) { #define bit_name(n) { PERF_FORMAT_##n, #n } @@ -1249,6 +1264,7 @@ static void __p_read_format(char *buf, size_t size, u64 value) #define p_unsigned(val) snprintf(buf, BUF_SIZE, "%"PRIu64, (uint64_t)(val)) #define p_signed(val) snprintf(buf, BUF_SIZE, "%"PRId64, (int64_t)(val)) #define p_sample_type(val) __p_sample_type(buf, BUF_SIZE, val) +#define p_branch_sample_type(val) __p_branch_sample_type(buf, BUF_SIZE, val) #define p_read_format(val) __p_read_format(buf, BUF_SIZE, val) #define PRINT_ATTRn(_n, _f, _p) \ @@ -1305,7 +1321,7 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr, PRINT_ATTRf(bp_type, p_unsigned); PRINT_ATTRn("{ bp_addr, config1 }", bp_addr, p_hex); PRINT_ATTRn("{ bp_len, config2 }", bp_len, p_hex); - PRINT_ATTRf(branch_sample_type, p_unsigned); + PRINT_ATTRf(branch_sample_type, p_branch_sample_type); PRINT_ATTRf(sample_regs_user, p_hex); PRINT_ATTRf(sample_stack_user, p_unsigned); PRINT_ATTRf(clockid, p_signed); -- GitLab From 6404436a63a463d03ef9b5d7cd5edd371e711a95 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 25 Apr 2016 22:17:17 +0200 Subject: [PATCH 343/705] perf tools: Make the x86 clean quiet Turn current clean output: $ make clean rm -f arch/x86/include/generated/asm/syscalls_64.c CLEAN libbpf CLEAN libapi into: $ make clean CLEAN x86 CLEAN libapi CLEAN libbpf Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: TJ Link: http://lkml.kernel.org/r/1461615438-27894-1-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- 
tools/perf/arch/x86/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile index a33729173b134..6c9211b18ec09 100644 --- a/tools/perf/arch/x86/Makefile +++ b/tools/perf/arch/x86/Makefile @@ -24,6 +24,6 @@ $(header): $(sys)/syscall_64.tbl $(systbl) $(Q)$(SHELL) '$(systbl)' $(sys)/syscall_64.tbl 'x86_64' > $@ clean:: - rm -f $(header) + $(call QUIET_CLEAN, x86) $(RM) $(header) archheaders: $(header) -- GitLab From ab362f5a95ff72a895c446e9d5ef548cac2fea07 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 25 Apr 2016 22:17:18 +0200 Subject: [PATCH 344/705] tools build: Fix perf_clean target Fix perf_clean target to follow the same logic as perf target. Fixes the following make invocation: $ cd && make tools/perf_clean Reported-by: TJ Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=116411 Link: http://lkml.kernel.org/r/1461615438-27894-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/Makefile b/tools/Makefile index 60c7e6c8ff178..6bf68fe7dd290 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -137,7 +137,8 @@ libsubcmd_clean: $(call descend,lib/subcmd,clean) perf_clean: - $(call descend,$(@:_clean=),clean) + $(Q)mkdir -p $(PERF_O) . + $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean selftests_clean: $(call descend,testing/$(@:_clean=),clean) -- GitLab From 3b556bced46aa6b1873da7faa18eff235e896adc Mon Sep 17 00:00:00 2001 From: Eric Engestrom Date: Mon, 25 Apr 2016 10:47:54 +0100 Subject: [PATCH 345/705] perf tools: Remove duplicate const qualifier Signed-off-by: Eric Engestrom Cc: Adrian Hunter Cc: David Ahern Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1461577678-29517-1-git-send-email-eric.engestrom@imgtec.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index dfd00c6dad6e6..de2036d1251b9 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -233,7 +233,7 @@ void thread__find_cpumode_addr_location(struct thread *thread, struct addr_location *al) { size_t i; - const u8 const cpumodes[] = { + const u8 cpumodes[] = { PERF_RECORD_MISC_USER, PERF_RECORD_MISC_KERNEL, PERF_RECORD_MISC_GUEST_USER, -- GitLab From 8daef508b0a144970e5cbc587525c351663fec63 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 23 Apr 2016 14:45:54 +0100 Subject: [PATCH 346/705] perf tests: Replace assignment with comparison on assert check The current assert check is checking an assignment, which will always be true.
Instead, the assert should be checking if scale is equal to 0.123 Signed-off-by: Colin Ian King Reviewed-by: Masami Hiramatsu Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1461419154-16918-1-git-send-email-colin.king@canonical.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/event_update.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index 012eab5d1df11..63ecf21750eb0 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -30,7 +30,7 @@ static int process_event_scale(struct perf_tool *tool __maybe_unused, TEST_ASSERT_VAL("wrong id", ev->id == 123); TEST_ASSERT_VAL("wrong id", ev->type == PERF_EVENT_UPDATE__SCALE); - TEST_ASSERT_VAL("wrong scale", ev_data->scale = 0.123); + TEST_ASSERT_VAL("wrong scale", ev_data->scale == 0.123); return 0; } -- GitLab From 73b1794e252b0476cc6e46461c7612cbaa88be45 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 20 Apr 2016 20:14:07 -0700 Subject: [PATCH 347/705] perf bench futex: Simplify wrapper for LOCK_PI Given that the 'val' parameter is ignored for FUTEX_LOCK_PI, get rid of the bogus deadlock detection flag in the wrapper code and avoid the extra argument, making it resemble its unlock counterpart. And if nothing else, we already only pass 0 anyway. Signed-off-by: Davidlohr Bueso Cc: Davidlohr Bueso Link: http://lkml.kernel.org/r/1461208447-29328-1-git-send-email-dave@stgolabs.net Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/futex-lock-pi.c | 2 +- tools/perf/bench/futex.h | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c index 6a18ce21f8659..6952db65508ab 100644 --- a/tools/perf/bench/futex-lock-pi.c +++ b/tools/perf/bench/futex-lock-pi.c @@ -83,7 +83,7 @@ static void *workerfn(void *arg) do { int ret; again: - ret = futex_lock_pi(w->futex, NULL, 0, futex_flag); + ret = futex_lock_pi(w->futex, NULL, futex_flag); if (ret) { /* handle lock acquisition */ if (!silent) diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h index d44de9f44281b..b2e06d1190d07 100644 --- a/tools/perf/bench/futex.h +++ b/tools/perf/bench/futex.h @@ -57,13 +57,11 @@ futex_wake(u_int32_t *uaddr, int nr_wake, int opflags) /** * futex_lock_pi() - block on uaddr as a PI mutex - * @detect: whether (1) or not (0) to perform deadlock detection */ static inline int -futex_lock_pi(u_int32_t *uaddr, struct timespec *timeout, int detect, - int opflags) +futex_lock_pi(u_int32_t *uaddr, struct timespec *timeout, int opflags) { - return futex(uaddr, FUTEX_LOCK_PI, detect, timeout, NULL, 0, opflags); + return futex(uaddr, FUTEX_LOCK_PI, 0, timeout, NULL, 0, opflags); } /** -- GitLab From c0664893050cc6b2d8b02d3e035f82fbfd0cd4cf Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 24 Apr 2016 19:56:43 +0100 Subject: [PATCH 348/705] perf intel-pt: Fix off-by-one comparison on maximum code The check for the maximum code is off-by-one; the current comparison of a code that is INTEL_PT_ERR_MAX will cause the strlcpy to perform an out of bounds array access on the intel_pt_err_msgs array. Fix this with a >= comparison.
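A minimal sketch of the failure mode, with hypothetical names rather than the real decoder tables:

  enum { ERR_OK, ERR_UNK, ERR_LOST, ERR_MAX };            /* ERR_MAX == 3 */
  static const char *msgs[ERR_MAX] = { "ok", "unknown", "lost" };
  /* "code > ERR_MAX" still admits code == ERR_MAX, reading msgs[3],
   * one element past the end; "code >= ERR_MAX" rejects it. */

The same reasoning applies to intel_pt_err_msgs[], whose valid indices the fix constrains to below INTEL_PT_ERR_MAX.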
Signed-off-by: Colin Ian King Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1461524203-10224-1-git-send-email-colin.king@canonical.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/intel-pt-decoder/intel-pt-decoder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c index 9409d014b46c7..9c8f15da86ce8 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c @@ -356,7 +356,7 @@ static const char *intel_pt_err_msgs[] = { int intel_pt__strerror(int code, char *buf, size_t buflen) { - if (code < 1 || code > INTEL_PT_ERR_MAX) + if (code < 1 || code >= INTEL_PT_ERR_MAX) code = INTEL_PT_ERR_UNK; strlcpy(buf, intel_pt_err_msgs[code], buflen); return 0; } -- GitLab From 09623d79466e996f5dc2753e16f04fda6f078041 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Sun, 24 Apr 2016 23:28:09 -0700 Subject: [PATCH 349/705] perf hists: Clear dummy entry accumulated period The accumulated period for dummy entry should also be 0. Otherwise, the total overhead could be overcounted. $ perf record -e '{LLC-load-misses,cpu/instructions/}' --call-graph=lbr ./tchain $ perf report --stdio # To display the perf.data header info, please use --header/--header-only options. # # Total Lost Samples: 0 # # Samples: 21K of event 'anon group { LLC-load-misses, cpu/instructions/ }' # Event count (approx.): 16313667937 # # Children Self Command Shared Object Symbol # ................ ................ ........... ................ ............................ # 4769.98% 0.01% 0.00% 0.01% tchain_edit [kernel.vmlinux] [k] update_fast_timekeeper 4356.18% 0.01% 0.00% 0.01% tchain_edit [kernel.vmlinux] [k] trigger_load_balance 3181.12% 0.01% 0.00% 0.01% tchain_edit [kernel.vmlinux] [k] irq_work_tick 1592.37% 0.00% 0.00% 0.00% tchain_edit [kernel.vmlinux] [k] cpu_needs_another_gp Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Andi Kleen Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1461565689-5862-1-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 991a351a8a41d..0f33d7e698c4e 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -2062,6 +2062,8 @@ static struct hist_entry *hists__add_dummy_entry(struct hists *hists, if (he) { memset(&he->stat, 0, sizeof(he->stat)); he->hists = hists; + if (symbol_conf.cumulate_callchain) + memset(he->stat_acc, 0, sizeof(he->stat)); rb_link_node(&he->rb_node_in, parent, p); rb_insert_color(&he->rb_node_in, root); hists__inc_stats(hists, he); -- GitLab From b04b7023751bf6519eee64467b6477f0e7fb82a1 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Tue, 26 Apr 2016 02:28:54 +0000 Subject: [PATCH 350/705] perf evlist: Enforce ring buffer reading Don't read broken data after 'head' pointer. Following commits will feed perf_evlist__mmap_read() with some 'head' pointers not maintained by kernel. If 'head' pointer breaks an event, we should avoid reading from the broken event. This can happen in backward ring buffer. For example:

                           old  head
                            |     |
                            V     V
+---+------+----------+----+-----+--+
|..E|D....D|C........C|B..B|A....|E.|
+---+------+----------+----+-----+--+

'old' pointer points to the beginning of 'A' and tries to read from it, but 'A' has been overwritten.
In this case, don't try to read from 'A', simply return NULL. Signed-off-by: Wang Nan Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1461637738-62722-2-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 6fb5725821de7..85271e54a63b3 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -684,6 +684,7 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) struct perf_mmap *md = &evlist->mmap[idx]; u64 head; u64 old = md->prev; + int diff; unsigned char *data = md->base + page_size; union perf_event *event = NULL; @@ -694,6 +695,7 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) return NULL; head = perf_mmap__read_head(md); + diff = head - old; if (evlist->overwrite) { /* * If we're further behind than half the buffer, there's a chance @@ -703,7 +705,6 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) * * In either case, truncate and restart at head. */ - int diff = head - old; if (diff > md->mask / 2 || diff < 0) { fprintf(stderr, "WARNING: failed to keep up with mmap data.\n"); @@ -711,15 +712,21 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) * head points to a known good entry, start there. */ old = head; + diff = 0; } } - if (old != head) { + if (diff >= (int)sizeof(event->header)) { size_t size; event = (union perf_event *)&data[old & md->mask]; size = event->header.size; + if (size < sizeof(event->header) || diff < (int)size) { + event = NULL; + goto broken_event; + } + /* * Event straddles the mmap boundary -- header should always * be inside due to u64 alignment of output. @@ -743,6 +750,7 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) old += size; } +broken_event: md->prev = old; return event; -- GitLab From 062d6c2aec0e087be956494a73221c04eca115fe Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 26 Apr 2016 15:47:37 +0900 Subject: [PATCH 351/705] perf probe: Close target file on error path Fix a bug to close target elf file in get_text_start_address(). 
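The fix follows the usual unwind idiom; a self-contained sketch with simplified error handling (not the actual perf code):

  #include <errno.h>
  #include <fcntl.h>
  #include <unistd.h>
  #include <libelf.h>

  static int inspect_elf(const char *path)
  {
          int ret = -EINVAL;
          int fd;
          Elf *elf;

          (void)elf_version(EV_CURRENT); /* libelf wants this once per process */
          fd = open(path, O_RDONLY);
          if (fd < 0)
                  return -errno;
          elf = elf_begin(fd, ELF_C_READ, NULL);
          if (elf == NULL)
                  goto out_close;        /* before the fix, the fd leaked here */
          /* ... read the headers, compute the address ... */
          ret = 0;
          elf_end(elf);
  out_close:
          close(fd);
          return ret;
  }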
Signed-off-by: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20160426064737.1443.44093.stgit@devbox Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 8319fbb086361..97b7f8e5fe69e 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -486,8 +486,10 @@ static int get_text_start_address(const char *exec, unsigned long *address) return -errno; elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL); - if (elf == NULL) - return -EINVAL; + if (elf == NULL) { + ret = -EINVAL; + goto out_close; + } if (gelf_getehdr(elf, &ehdr) == NULL) goto out; @@ -499,6 +501,9 @@ static int get_text_start_address(const char *exec, unsigned long *address) ret = 0; out: elf_end(elf); +out_close: + close(fd); + return ret; } -- GitLab From 8d4d5c3a7c25e69075e60e5e70c1e05c205aef89 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 26 Apr 2016 16:00:51 +0200 Subject: [PATCH 352/705] regulator: axp20x: Fix LDO4 linear voltage range The current linear voltage range for the LDO4 regulator found in the APX20X PMICs assumes that the voltage is linear between 2.5 and 3.1V. However, the PMIC can output up to 3.3V on that regulator by skipping the 2.6V and 2.9V steps. Fix the ranges to read and set the proper voltages. Fixes: 13d57e64352a ("regulator: axp20x: Use linear voltage ranges for AXP20X LDO4") Signed-off-by: Maxime Ripard Acked-by: Chen-Yu Tsai Signed-off-by: Mark Brown --- drivers/regulator/axp20x-regulator.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c index 40cd894e4df5e..29ab0985b46e1 100644 --- a/drivers/regulator/axp20x-regulator.c +++ b/drivers/regulator/axp20x-regulator.c @@ -157,7 +157,9 @@ static struct regulator_ops axp20x_ops_sw = { static const struct regulator_linear_range axp20x_ldo4_ranges[] = { REGULATOR_LINEAR_RANGE(1250000, 0x0, 0x0, 0), REGULATOR_LINEAR_RANGE(1300000, 0x1, 0x8, 100000), - REGULATOR_LINEAR_RANGE(2500000, 0x9, 0xf, 100000), + REGULATOR_LINEAR_RANGE(2500000, 0x9, 0x9, 0), + REGULATOR_LINEAR_RANGE(2700000, 0xa, 0xb, 100000), + REGULATOR_LINEAR_RANGE(3000000, 0xc, 0xf, 100000), }; static const struct regulator_desc axp20x_regulators[] = { -- GitLab From e1ce726e1db2522b4848b3acffb7ece12439517c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 26 Apr 2016 18:02:42 +0900 Subject: [PATCH 353/705] perf tools: Add lsdir() helper to read a directory As a utility function, add lsdir() which reads given directory and store entry name into a strlist. lsdir accepts a filter function so that user can filter out unneeded entries. 
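A hypothetical usage fragment inside the perf tree (the path is only an example; str_node and the strlist__for_each() iteration macro come from util/strlist.h in this era of the tree):

  struct str_node *ent;
  struct strlist *entries = lsdir("/sys/kernel/debug/tracing/events",
                                  lsdir_no_dot_filter);

  if (entries == NULL)
          return -errno;
  strlist__for_each(ent, entries)
          pr_debug("%s\n", ent->s);
  strlist__delete(entries);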
Signed-off-by: Masami Hiramatsu Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Hemant Kumar Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20160426090242.11891.79014.stgit@devbox [ Do not use the 'dirname' it is used in some distros ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/util.c | 34 ++++++++++++++++++++++++++++++++++ tools/perf/util/util.h | 3 +++ 2 files changed, 37 insertions(+) diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index b7766c577b015..9473d46c00bba 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -117,6 +117,40 @@ int rm_rf(char *path) return rmdir(path); } +/* A filter which removes dot files */ +bool lsdir_no_dot_filter(const char *name __maybe_unused, struct dirent *d) +{ + return d->d_name[0] != '.'; +} + +/* lsdir reads a directory and store it in strlist */ +struct strlist *lsdir(const char *name, + bool (*filter)(const char *, struct dirent *)) +{ + struct strlist *list = NULL; + DIR *dir; + struct dirent *d; + + dir = opendir(name); + if (!dir) + return NULL; + + list = strlist__new(NULL, NULL); + if (!list) { + errno = -ENOMEM; + goto out; + } + + while ((d = readdir(dir)) != NULL) { + if (!filter || filter(name, d)) + strlist__add(list, d->d_name); + } + +out: + closedir(dir); + return list; +} + static int slow_copyfile(const char *from, const char *to) { int err = -1; diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 3bf3de86d4297..26a924651e7be 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -79,6 +79,7 @@ #include #include #include +#include "strlist.h" extern const char *graph_line; extern const char *graph_dotted_line; @@ -222,6 +223,8 @@ static inline int sane_case(int x, int high) int mkdir_p(char *path, mode_t mode); int rm_rf(char *path); +struct strlist *lsdir(const char *name, bool (*filter)(const char *, struct dirent *)); +bool lsdir_no_dot_filter(const char *name, struct dirent *d); int copyfile(const char *from, const char *to); int copyfile_mode(const char *from, const char *to, mode_t mode); int copyfile_offset(int fromfd, loff_t from_ofs, int tofd, loff_t to_ofs, u64 size); -- GitLab From 6ed0720a74d90eea51284c07592369d45d56f1f7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 26 Apr 2016 18:03:04 +0900 Subject: [PATCH 354/705] perf probe: Let probe_file__add_event return 0 if succeeded Since other methods return 0 if succeeded (or filedesc), let probe_file__add_event() return 0 instead of the length of written bytes. 
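The caller-visible contract after the change, sketched; before it, a successful write returned the (positive) byte count:

  if (probe_file__add_event(fd, tev) < 0)  /* 0 now unambiguously means success */
          pr_warning("Failed to add the event\n");

Callers that only test for a negative return are unaffected either way.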
Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Hemant Kumar Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20160426090303.11891.18232.stgit@devbox Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-file.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c index e3b3b92e44587..3fe6214970e63 100644 --- a/tools/perf/util/probe-file.c +++ b/tools/perf/util/probe-file.c @@ -220,8 +220,7 @@ int probe_file__add_event(int fd, struct probe_trace_event *tev) pr_debug("Writing event: %s\n", buf); if (!probe_event_dry_run) { - ret = write(fd, buf, strlen(buf)); - if (ret <= 0) { + if (write(fd, buf, strlen(buf)) < (int)strlen(buf)) { ret = -errno; pr_warning("Failed to write event: %s\n", strerror_r(errno, sbuf, sizeof(sbuf))); -- GitLab From 2a12ec13cc4ca8690ad2690c09bf8bff17c228d9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 26 Apr 2016 18:04:13 +0900 Subject: [PATCH 355/705] perf probe: Set default kprobe group name if it is not given Set kprobe group name as "probe" if it is not given. Signed-off-by: Masami Hiramatsu Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Hemant Kumar Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20160426090413.11891.95640.stgit@devbox Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 97b7f8e5fe69e..0de5d10dda71c 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -2748,9 +2748,13 @@ static int convert_to_probe_trace_events(struct perf_probe_event *pev, { int ret; - if (pev->uprobes && !pev->group) { - /* Replace group name if not given */ - ret = convert_exec_to_group(pev->target, &pev->group); + if (!pev->group) { + /* Set group name if not given */ + if (!pev->uprobes) { + pev->group = strdup(PERFPROBE_GROUP); + ret = pev->group ? 0 : -ENOMEM; + } else + ret = convert_exec_to_group(pev->target, &pev->group); if (ret != 0) { pr_warning("Failed to make a group name.\n"); return ret; -- GitLab From 62de344e4fed23a42048c02dfa841f7d7ce9e88e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 26 Apr 2016 11:03:03 -0300 Subject: [PATCH 356/705] perf trace: Move perf_flags beautifier to tools/perf/trace/beauty/ To reduce the size of builtin-trace.c. 
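For a concrete decode with the table shown below: flags == 0x9 is PERF_FLAG_FD_NO_GROUP (bit 0) plus PERF_FLAG_FD_CLOEXEC (bit 3), so the P_FLAG() chain renders "FD_NO_GROUP|FD_CLOEXEC", and any residue matching no known bit is appended in hex.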
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-8r3gmymyn3r0ynt4yuzspp9g@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 45 +---------------------- tools/perf/trace/beauty/perf_event_open.c | 43 ++++++++++++++++++++++ 2 files changed, 44 insertions(+), 44 deletions(-) create mode 100644 tools/perf/trace/beauty/perf_event_open.c diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index a4b133fac82ba..903deda92d9e8 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -56,22 +56,6 @@ # define MSG_CMSG_CLOEXEC 0x40000000 #endif -#ifndef PERF_FLAG_FD_NO_GROUP -# define PERF_FLAG_FD_NO_GROUP (1UL << 0) -#endif - -#ifndef PERF_FLAG_FD_OUTPUT -# define PERF_FLAG_FD_OUTPUT (1UL << 1) -#endif - -#ifndef PERF_FLAG_PID_CGROUP -# define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ -#endif - -#ifndef PERF_FLAG_FD_CLOEXEC -# define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ -#endif - struct trace { struct perf_tool tool; struct syscalltbl *sctbl; @@ -674,34 +658,6 @@ static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size, #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags -static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, flags = arg->val; - - if (flags == 0) - return 0; - -#define P_FLAG(n) \ - if (flags & PERF_FLAG_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - flags &= ~PERF_FLAG_##n; \ - } - - P_FLAG(FD_NO_GROUP); - P_FLAG(FD_OUTPUT); - P_FLAG(PID_CGROUP); - P_FLAG(FD_CLOEXEC); -#undef P_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); - - return printed; -} - -#define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags - static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size, struct syscall_arg *arg) { @@ -894,6 +850,7 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, #include "trace/beauty/pid.c" #include "trace/beauty/mmap.c" #include "trace/beauty/mode_t.c" +#include "trace/beauty/perf_event_open.c" #include "trace/beauty/sched_policy.c" #include "trace/beauty/socket_type.c" #include "trace/beauty/waitid_options.c" diff --git a/tools/perf/trace/beauty/perf_event_open.c b/tools/perf/trace/beauty/perf_event_open.c new file mode 100644 index 0000000000000..311f09dd718d0 --- /dev/null +++ b/tools/perf/trace/beauty/perf_event_open.c @@ -0,0 +1,43 @@ +#ifndef PERF_FLAG_FD_NO_GROUP +# define PERF_FLAG_FD_NO_GROUP (1UL << 0) +#endif + +#ifndef PERF_FLAG_FD_OUTPUT +# define PERF_FLAG_FD_OUTPUT (1UL << 1) +#endif + +#ifndef PERF_FLAG_PID_CGROUP +# define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ +#endif + +#ifndef PERF_FLAG_FD_CLOEXEC +# define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ +#endif + +static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + + if (flags == 0) + return 0; + +#define P_FLAG(n) \ + if (flags & PERF_FLAG_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~PERF_FLAG_##n; \ + } + + P_FLAG(FD_NO_GROUP); + P_FLAG(FD_OUTPUT); + P_FLAG(PID_CGROUP); + P_FLAG(FD_CLOEXEC); +#undef P_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? 
"|" : "", flags); + + return printed; +} + +#define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags -- GitLab From ccd9b2a7f82b069b8e8ac892fd9c1c22e7b11eba Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 26 Apr 2016 11:40:17 -0300 Subject: [PATCH 357/705] perf trace: Do not beautify the 'pid' parameter as a simple integer Leave it alone so that it ends up assigned to SCA_PID via its type, 'pid_t', that will look up the pid on the machine thread rb_tree and possibly find its COMM. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-r7dujgmhtxxfajuunpt1bkuo@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 903deda92d9e8..48b00f0425991 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1043,8 +1043,7 @@ static struct syscall_fmt { [1] = SCA_FILENAME, /* filename */ [2] = SCA_OPEN_FLAGS, /* flags */ }, }, { .name = "perf_event_open", .errmsg = true, - .arg_scnprintf = { [1] = SCA_INT, /* pid */ - [2] = SCA_INT, /* cpu */ + .arg_scnprintf = { [2] = SCA_INT, /* cpu */ [3] = SCA_FD, /* group_fd */ [4] = SCA_PERF_FLAGS, /* flags */ }, }, { .name = "pipe2", .errmsg = true, -- GitLab From 4bd112df3eea4db63fe90fb4e83c48d3f3bd6512 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 26 Apr 2016 12:31:16 -0300 Subject: [PATCH 358/705] tools lib api fs: Add helper to read string from procfs file To read things like /proc/self/comm. Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-ztpkbmseidt0hq2psr46o0h9@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/api/fs/fs.c | 13 +++++++++++++ tools/lib/api/fs/fs.h | 2 ++ 2 files changed, 15 insertions(+) diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c index ef78c22ff44d4..08556cf2c70d4 100644 --- a/tools/lib/api/fs/fs.c +++ b/tools/lib/api/fs/fs.c @@ -351,6 +351,19 @@ int filename__read_str(const char *filename, char **buf, size_t *sizep) return err; } +int procfs__read_str(const char *entry, char **buf, size_t *sizep) +{ + char path[PATH_MAX]; + const char *procfs = procfs__mountpoint(); + + if (!procfs) + return -1; + + snprintf(path, sizeof(path), "%s/%s", procfs, entry); + + return filename__read_str(path, buf, sizep); +} + int sysfs__read_ull(const char *entry, unsigned long long *value) { char path[PATH_MAX]; diff --git a/tools/lib/api/fs/fs.h b/tools/lib/api/fs/fs.h index 9f6598098dc58..16c9c2ed7c5bf 100644 --- a/tools/lib/api/fs/fs.h +++ b/tools/lib/api/fs/fs.h @@ -29,6 +29,8 @@ int filename__read_int(const char *filename, int *value); int filename__read_ull(const char *filename, unsigned long long *value); int filename__read_str(const char *filename, char **buf, size_t *sizep); +int procfs__read_str(const char *entry, char **buf, size_t *sizep); + int sysctl__read_int(const char *sysctl, int *value); int sysfs__read_int(const char *entry, int *value); int sysfs__read_ull(const char *entry, unsigned long long *value); -- GitLab From 2f3027ac28bf6bc3ac7eb851eab06f2a38af5caa Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 26 Apr 2016 12:32:50 -0300 Subject: [PATCH 359/705] perf thread: Introduce method to set comm from /proc/pid/self Will be used for lazy comm loading in 'perf trace'. 
Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-7ogbkuoka1y2qsmcckqxvl5m@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/thread.c | 19 +++++++++++++++++++ tools/perf/util/thread.h | 2 ++ 2 files changed, 21 insertions(+) diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index de2036d1251b9..45fcb715a36b3 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -10,6 +10,8 @@ #include "comm.h" #include "unwind.h" +#include + int thread__init_map_groups(struct thread *thread, struct machine *machine) { struct thread *leader; @@ -153,6 +155,23 @@ int __thread__set_comm(struct thread *thread, const char *str, u64 timestamp, return 0; } +int thread__set_comm_from_proc(struct thread *thread) +{ + char path[64]; + char *comm = NULL; + size_t sz; + int err = -1; + + if (!(snprintf(path, sizeof(path), "%d/task/%d/comm", + thread->pid_, thread->tid) >= (int)sizeof(path)) && + procfs__read_str(path, &comm, &sz) == 0) { + comm[sz - 1] = '\0'; + err = thread__set_comm(thread, comm, 0); + } + + return err; +} + const char *thread__comm_str(const struct thread *thread) { const struct comm *comm = thread__comm(thread); diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index e214207bb13ac..45fba13c800bd 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -71,6 +71,8 @@ static inline int thread__set_comm(struct thread *thread, const char *comm, return __thread__set_comm(thread, comm, timestamp, false); } +int thread__set_comm_from_proc(struct thread *thread); + int thread__comm_len(struct thread *thread); struct comm *thread__comm(const struct thread *thread); struct comm *thread__exec_comm(const struct thread *thread); -- GitLab From 073e5fca53d30ffe9e2fc637a001c78b2cdca7dd Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 26 Apr 2016 12:33:46 -0300 Subject: [PATCH 360/705] perf trace: Read thread's COMM from /proc when not set We get notifications for threads that gets created while we're tracing, but for preexisting threads we may end not having synthesized them, like when tracing a 'perf trace' session that will use '--pid' to trace some other thread. And besides we should probably stop synthesizing those records and instead read thread information in a lazy way, i.e. just when we need, like done in this patch: Now the 'pid_t' argument in 'perf_event_open' gets translated to a COMM: # perf trace -e perf_event_open perf stat -e cycles -p 31601 0.027 ( 0.027 ms): perf/23393 perf_event_open(attr_uptr: 0x2fdd0d8, pid: 31601 (abrt-dump-journ), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ = 3 ^C And in other syscalls containing pid_t without thread->comm_set at the time of the formatting. 
Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-ioeps6dlwst17d6oozc9shtk@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/pid.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/trace/beauty/pid.c b/tools/perf/trace/beauty/pid.c index 111ae08d38f10..07486ea65ae3c 100644 --- a/tools/perf/trace/beauty/pid.c +++ b/tools/perf/trace/beauty/pid.c @@ -3,9 +3,12 @@ static size_t syscall_arg__scnprintf_pid(char *bf, size_t size, struct syscall_a int pid = arg->val; struct trace *trace = arg->trace; size_t printed = scnprintf(bf, size, "%d", pid); - struct thread *thread = machine__find_thread(trace->host, pid, pid); + struct thread *thread = machine__findnew_thread(trace->host, pid, pid); if (thread != NULL) { + if (!thread->comm_set) + thread__set_comm_from_proc(thread); + if (thread->comm_set) printed += scnprintf(bf + printed, size - printed, " (%s)", thread__comm_str(thread)); -- GitLab From 63a29613d7c69f17b6a3266bfc338986698b2546 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 26 Apr 2016 19:55:41 +0530 Subject: [PATCH 361/705] perf probe: Fix offline module name missmatch issue Perf can add a probe on kernel module which has not been loaded yet. The current implementation finds the module name from path. But if the filename is different from the actual module name then perf fails to register a probe while loading module because of mismatch in the names. For example, samples/kobject/kobject-example.ko is loaded as kobject_example. Before applying patch: $ sudo ./perf probe -m /linux/samples/kobject/kobject-example.ko foo_show Added new event: probe:foo_show (on foo_show in kobject-example) You can now use it in all perf tools, such as: perf record -e probe:foo_show -aR sleep 1 $ cat /sys/kernel/debug/tracing/kprobe_events p:probe/foo_show kobject-example:foo_show $ insmod kobject-example.ko $ lsmod Module Size Used by kobject_example 16384 0 Generate read to /sys/kernel/kobject_example/foo while recording data with below command $ sudo ./perf record -e probe:foo_show -a [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.093 MB perf.data ] $./perf report --stdio -F overhead,comm,dso,sym Error: The perf.data.old file has no samples! After applying patch: $ sudo ./perf probe -m /linux/samples/kobject/kobject-example.ko foo_show Added new event: probe:foo_show (on foo_show in kobject_example) You can now use it in all perf tools, such as: perf record -e probe:foo_show -aR sleep 1 $ sudo cat /sys/kernel/debug/tracing/kprobe_events p:probe/foo_show kobject_example:foo_show $ insmod kobject-example.ko $ lsmod Module Size Used by kobject_example 16384 0 Generate read to /sys/kernel/kobject_example/foo while recording data with below command $ sudo ./perf record -e probe:foo_show -a [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.097 MB perf.data (8 samples) ] $ sudo ./perf report --stdio -F overhead,comm,dso,sym ... # Samples: 8 of event 'probe:foo_show' # Event count (approx.): 8 # # Overhead Command Shared Object Symbol # ........ ....... ................. ............ # 100.00% cat [kobject_example] [k] foo_show Signed-off-by: Ravi Bangoria Acked-by: Masami Hiramatsu Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Naveen N. 
Rao Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Wang Nan Link: http://lkml.kernel.org/r/1461680741-12517-2-git-send-email-ravi.bangoria@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 0de5d10dda71c..bc2eb7cda2d16 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -588,32 +588,23 @@ static int add_module_to_probe_trace_events(struct probe_trace_event *tevs, int ntevs, const char *module) { int i, ret = 0; - char *tmp; + char *mod_name = NULL; if (!module) return 0; - tmp = strrchr(module, '/'); - if (tmp) { - /* This is a module path -- get the module name */ - module = strdup(tmp + 1); - if (!module) - return -ENOMEM; - tmp = strchr(module, '.'); - if (tmp) - *tmp = '\0'; - tmp = (char *)module; /* For free() */ - } + mod_name = find_module_name(module); for (i = 0; i < ntevs; i++) { - tevs[i].point.module = strdup(module); + tevs[i].point.module = + strdup(mod_name ? mod_name : module); if (!tevs[i].point.module) { ret = -ENOMEM; break; } } - free(tmp); + free(mod_name); return ret; } -- GitLab From c61fb959df898b994382d586046d7704476ff503 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 26 Apr 2016 19:55:40 +0530 Subject: [PATCH 362/705] perf probe: Fix module probe issue if no dwarf support Perf is not able to register probe in kernel module when dwarf supprt is not there(and so it goes for symtab). Perf passes full path of module where only module name is required which is causing the problem. This patch fixes this issue. Before applying patch: $ dpkg -s libdw-dev dpkg-query: package 'libdw-dev' is not installed and no information is... $ sudo ./perf probe -m /linux/samples/kprobes/kprobe_example.ko kprobe_init Added new event: probe:kprobe_init (on kprobe_init in /linux/samples/kprobes/kprobe_example.ko) You can now use it in all perf tools, such as: perf record -e probe:kprobe_init -aR sleep 1 $ sudo cat /sys/kernel/debug/tracing/kprobe_events p:probe/kprobe_init /linux/samples/kprobes/kprobe_example.ko:kprobe_init $ sudo ./perf record -a -e probe:kprobe_init [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.105 MB perf.data ] $ sudo ./perf script # No output here After applying patch: $ sudo ./perf probe -m /linux/samples/kprobes/kprobe_example.ko kprobe_init Added new event: probe:kprobe_init (on kprobe_init in kprobe_example) You can now use it in all perf tools, such as: perf record -e probe:kprobe_init -aR sleep 1 $ sudo cat /sys/kernel/debug/tracing/kprobe_events p:probe/kprobe_init kprobe_example:kprobe_init $ sudo ./perf record -a -e probe:kprobe_init [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.105 MB perf.data (2 samples) ] $ sudo ./perf script insmod 13990 [002] 5961.216833: probe:kprobe_init: ... insmod 13995 [002] 5962.889384: probe:kprobe_init: ... Signed-off-by: Ravi Bangoria Acked-by: Masami Hiramatsu Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Naveen N. 
Rao Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Wang Nan Link: http://lkml.kernel.org/r/1461680741-12517-1-git-send-email-ravi.bangoria@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 76 +++++++++++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index bc2eb7cda2d16..a9774628c6f6a 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -265,6 +265,65 @@ static bool kprobe_warn_out_range(const char *symbol, unsigned long address) return true; } +/* + * NOTE: + * '.gnu.linkonce.this_module' section of kernel module elf directly + * maps to 'struct module' from linux/module.h. This section contains + * actual module name which will be used by kernel after loading it. + * But, we cannot use 'struct module' here since linux/module.h is not + * exposed to user-space. Offset of 'name' has remained same from long + * time, so hardcoding it here. + */ +#ifdef __LP64__ +#define MOD_NAME_OFFSET 24 +#else +#define MOD_NAME_OFFSET 12 +#endif + +/* + * @module can be module name of module file path. In case of path, + * inspect elf and find out what is actual module name. + * Caller has to free mod_name after using it. + */ +static char *find_module_name(const char *module) +{ + int fd; + Elf *elf; + GElf_Ehdr ehdr; + GElf_Shdr shdr; + Elf_Data *data; + Elf_Scn *sec; + char *mod_name = NULL; + + fd = open(module, O_RDONLY); + if (fd < 0) + return NULL; + + elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL); + if (elf == NULL) + goto elf_err; + + if (gelf_getehdr(elf, &ehdr) == NULL) + goto ret_err; + + sec = elf_section_by_name(elf, &ehdr, &shdr, + ".gnu.linkonce.this_module", NULL); + if (!sec) + goto ret_err; + + data = elf_getdata(sec, NULL); + if (!data || !data->d_buf) + goto ret_err; + + mod_name = strdup((char *)data->d_buf + MOD_NAME_OFFSET); + +ret_err: + elf_end(elf); +elf_err: + close(fd); + return mod_name; +} + #ifdef HAVE_DWARF_SUPPORT static int kernel_get_module_dso(const char *module, struct dso **pdso) @@ -2512,6 +2571,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev, struct probe_trace_point *tp; int num_matched_functions; int ret, i, j, skipped = 0; + char *mod_name; map = get_target_map(pev->target, pev->uprobes); if (!map) { @@ -2596,9 +2656,19 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev, tp->realname = strdup_or_goto(sym->name, nomem_out); tp->retprobe = pp->retprobe; - if (pev->target) - tev->point.module = strdup_or_goto(pev->target, - nomem_out); + if (pev->target) { + if (pev->uprobes) { + tev->point.module = strdup_or_goto(pev->target, + nomem_out); + } else { + mod_name = find_module_name(pev->target); + tev->point.module = + strdup(mod_name ? mod_name : pev->target); + free(mod_name); + if (!tev->point.module) + goto nomem_out; + } + } tev->uprobes = pev->uprobes; tev->nargs = pev->nargs; if (tev->nargs) { -- GitLab From 042a181086077bdb83e38626b35fb96adbe45039 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 26 Apr 2016 12:58:45 -0300 Subject: [PATCH 363/705] perf tools: Update x86's syscall_64.tbl, adding preadv2 & pwritev2 Introduced in commit 4babf2c5efb7 ("x86: wire up preadv2 and pwritev2"). This will make 'perf trace' aware of them. 
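With the regenerated table the new names should resolve as strace-like filters, e.g. a hypothetical session: $ perf trace -e preadv2,pwritev2 (on pre-4.6 kernels the syscalls themselves still return ENOSYS).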
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-vojoylgce2cetsy36446s5ny@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index 2e5b565adacc5..cac6d17ce5db0 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -333,6 +333,8 @@ 324 common membarrier sys_membarrier 325 common mlock2 sys_mlock2 326 common copy_file_range sys_copy_file_range +327 64 preadv2 sys_preadv2 +328 64 pwritev2 sys_pwritev2 # # x32-specific system call numbers start at 512 to avoid cache impact -- GitLab From c2a218c63ba36946aca5943c0c8ebd3a42e3dc4b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 26 Apr 2016 13:27:23 -0300 Subject: [PATCH 364/705] perf bench: Remove one more die() call Propagate the error instead. Cc: David Ahern Cc: Hitoshi Mitake Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-z6erjg35d1gekevwujoa0223@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/mem-functions.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c index a91aa85d80ffc..2b54d0f2672a3 100644 --- a/tools/perf/bench/mem-functions.c +++ b/tools/perf/bench/mem-functions.c @@ -6,6 +6,7 @@ * Written by Hitoshi Mitake */ +#include "debug.h" #include "../perf.h" #include "../util/util.h" #include @@ -63,14 +64,16 @@ static struct perf_event_attr cycle_attr = { .config = PERF_COUNT_HW_CPU_CYCLES }; -static void init_cycles(void) +static int init_cycles(void) { cycles_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, perf_event_open_cloexec_flag()); - if (cycles_fd < 0 && errno == ENOSYS) - die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); - else - BUG_ON(cycles_fd < 0); + if (cycles_fd < 0 && errno == ENOSYS) { + pr_debug("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); + return -1; + } + + return cycles_fd; } static u64 get_cycles(void) @@ -155,8 +158,13 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info * argc = parse_options(argc, argv, options, info->usage, 0); - if (use_cycles) - init_cycles(); + if (use_cycles) { + i = init_cycles(); + if (i < 0) { + fprintf(stderr, "Failed to open cycles counter\n"); + return i; + } + } size = (size_t)perf_atoll((char *)size_str); size_total = (double)size * nr_loops; -- GitLab From c5dfd78eb79851e278b7973031b9ca363da87a7e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 21 Apr 2016 12:28:50 -0300 Subject: [PATCH 365/705] perf core: Allow setting up max frame stack depth via sysctl The default remains 127, which is good for most cases, and not even hit most of the time, but then for some cases, as reported by Brendan, 1024+ deep frames are appearing on the radar for things like groovy, ruby. And in some workloads putting a _lower_ cap on this may make sense. One that is per event still needs to be put in place tho. 
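For scale, from the structures touched below: struct perf_callchain_entry is one __u64 counter plus the flexible ip[] array, so at the default of 127 each entry costs 8 + 127 * 8 = 1024 bytes; with one entry per recursion context (PERF_NR_CONTEXTS, 4 in this era's headers) that is 4 KiB per CPU, while the 640 * 1024 ceiling works out to about 5 MiB per context, roughly 20 MiB per CPU.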
The new file is: # cat /proc/sys/kernel/perf_event_max_stack 127 Changing it: # echo 256 > /proc/sys/kernel/perf_event_max_stack # cat /proc/sys/kernel/perf_event_max_stack 256 But as soon as there is some event using callchains we get: # echo 512 > /proc/sys/kernel/perf_event_max_stack -bash: echo: write error: Device or resource busy # Because we only allocate the callchain percpu data structures when there is a user, which allows for changing the max easily, it's just a matter of having no callchain users at that point. Reported-and-Tested-by: Brendan Gregg Reviewed-by: Frederic Weisbecker Acked-by: Alexei Starovoitov Acked-by: David Ahern Cc: Adrian Hunter Cc: Alexander Shishkin Cc: He Kuang Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wang Nan Cc: Zefan Li Link: http://lkml.kernel.org/r/20160426002928.GB16708@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- Documentation/sysctl/kernel.txt | 14 ++++++++++++ arch/arm/kernel/perf_callchain.c | 2 +- arch/arm64/kernel/perf_callchain.c | 4 ++-- arch/metag/kernel/perf_callchain.c | 2 +- arch/mips/kernel/perf_event.c | 4 ++-- arch/powerpc/perf/callchain.c | 4 ++-- arch/sparc/kernel/perf_event.c | 6 ++--- arch/x86/events/core.c | 4 ++-- arch/xtensa/kernel/perf_event.c | 4 ++-- include/linux/perf_event.h | 8 +++++-- kernel/bpf/stackmap.c | 8 +++---- kernel/events/callchain.c | 35 ++++++++++++++++++++++++++++-- kernel/sysctl.c | 12 ++++++++++ 13 files changed, 84 insertions(+), 23 deletions(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 57653a44b128c..260cde08e92e8 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -60,6 +60,7 @@ show up in /proc/sys/kernel: - panic_on_warn - perf_cpu_time_max_percent - perf_event_paranoid +- perf_event_max_stack - pid_max - powersave-nap [ PPC only ] - printk @@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN). The default value is 1. ============================================================== +perf_event_max_stack: + +Controls maximum number of stack frames to copy for (attr.sample_type & +PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using +'perf record -g' or 'perf trace --call-graph fp'. + +This can only be done when no events are in use that have callchains +enabled, otherwise writing to this file will return -EBUSY. + +The default value is 127. + +============================================================== + pid_max: PID allocation wrap value.
When the kernel's next PID value diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c index 4e02ae5950ff6..27563befa8a2d 100644 --- a/arch/arm/kernel/perf_callchain.c +++ b/arch/arm/kernel/perf_callchain.c @@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) tail = (struct frame_tail __user *)regs->ARM_fp - 1; - while ((entry->nr < PERF_MAX_STACK_DEPTH) && + while ((entry->nr < sysctl_perf_event_max_stack) && tail && !((unsigned long)tail & 0x3)) tail = user_backtrace(tail, entry); } diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index ff4665462a025..32c3c6e70119f 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry, tail = (struct frame_tail __user *)regs->regs[29]; - while (entry->nr < PERF_MAX_STACK_DEPTH && + while (entry->nr < sysctl_perf_event_max_stack && tail && !((unsigned long)tail & 0xf)) tail = user_backtrace(tail, entry); } else { @@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry, tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; - while ((entry->nr < PERF_MAX_STACK_DEPTH) && + while ((entry->nr < sysctl_perf_event_max_stack) && tail && !((unsigned long)tail & 0x3)) tail = compat_user_backtrace(tail, entry); #endif diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c index 315633461a945..252abc12a5a31 100644 --- a/arch/metag/kernel/perf_callchain.c +++ b/arch/metag/kernel/perf_callchain.c @@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) --frame; - while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame) + while ((entry->nr < sysctl_perf_event_max_stack) && frame) frame = user_backtrace(frame, entry); } diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c index c1cf9c6c3f770..5021c546ad07d 100644 --- a/arch/mips/kernel/perf_event.c +++ b/arch/mips/kernel/perf_event.c @@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry, addr = *sp++; if (__kernel_text_address(addr)) { perf_callchain_store(entry, addr); - if (entry->nr >= PERF_MAX_STACK_DEPTH) + if (entry->nr >= sysctl_perf_event_max_stack) break; } } @@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, } do { perf_callchain_store(entry, pc); - if (entry->nr >= PERF_MAX_STACK_DEPTH) + if (entry->nr >= sysctl_perf_event_max_stack) break; pc = unwind_stack(current, &sp, pc, &ra); } while (pc); diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index e04a6752b3999..22d9015c1acc8 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, sp = regs->gpr[1]; perf_callchain_store(entry, next_ip); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { fp = (unsigned long __user *) sp; if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp)) return; @@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, sp = regs->gpr[1]; perf_callchain_store(entry, next_ip); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { fp = (unsigned int __user *) (unsigned long) sp; if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp)) return; diff --git 
a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 6596f66ce1126..a4b8b5aed21c7 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, } } #endif - } while (entry->nr < PERF_MAX_STACK_DEPTH); + } while (entry->nr < sysctl_perf_event_max_stack); } static inline int @@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, pc = sf.callers_pc; ufp = (unsigned long)sf.fp + STACK_BIAS; perf_callchain_store(entry, pc); - } while (entry->nr < PERF_MAX_STACK_DEPTH); + } while (entry->nr < sysctl_perf_event_max_stack); } static void perf_callchain_user_32(struct perf_callchain_entry *entry, @@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, ufp = (unsigned long)sf.fp; } perf_callchain_store(entry, pc); - } while (entry->nr < PERF_MAX_STACK_DEPTH); + } while (entry->nr < sysctl_perf_event_max_stack); } void diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 041e442a3e280..41d93d0e972b3 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2277,7 +2277,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { unsigned long bytes; frame.next_frame = 0; frame.return_address = 0; @@ -2337,7 +2337,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) return; pagefault_disable(); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c index 54f01188c29c1..a6b00b3af4299 100644 --- a/arch/xtensa/kernel/perf_event.c +++ b/arch/xtensa/kernel/perf_event.c @@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data) void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { - xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH, + xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack, callchain_trace, NULL, entry); } void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { - xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH, + xtensa_backtrace_user(regs, sysctl_perf_event_max_stack, callchain_trace, entry); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 85749ae8cb5fa..a090700ccccaf 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -58,7 +58,7 @@ struct perf_guest_info_callbacks { struct perf_callchain_entry { __u64 nr; - __u64 ip[PERF_MAX_STACK_DEPTH]; + __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */ }; struct perf_raw_record { @@ -993,9 +993,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, extern int get_callchain_buffers(void); extern void put_callchain_buffers(void); +extern int sysctl_perf_event_max_stack; + static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip) { - if (entry->nr < PERF_MAX_STACK_DEPTH) { + if (entry->nr < sysctl_perf_event_max_stack) { entry->ip[entry->nr++] = ip; return 0; } else { @@ -1017,6 +1019,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +int 
perf_event_max_stack_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); static inline bool perf_paranoid_tracepoint_raw(void) { diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 499d9e933f8e5..f5a19548be12e 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || value_size < 8 || value_size % 8 || - value_size / 8 > PERF_MAX_STACK_DEPTH) + value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); /* hash table size must be power of 2 */ @@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; u32 max_depth = map->value_size / 8; - /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */ - u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth; + /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ + u32 init_nr = sysctl_perf_event_max_stack - max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len; bool user = flags & BPF_F_USER_STACK; @@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) return -EFAULT; /* get_perf_callchain() guarantees that trace->nr >= init_nr - * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth + * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth */ trace_nr = trace->nr - init_nr; diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 343c22f5e867d..b9325e7dcba10 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -18,6 +18,14 @@ struct callchain_cpus_entries { struct perf_callchain_entry *cpu_entries[0]; }; +int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; + +static inline size_t perf_callchain_entry__sizeof(void) +{ + return (sizeof(struct perf_callchain_entry) + + sizeof(__u64) * sysctl_perf_event_max_stack); +} + static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); static atomic_t nr_callchain_events; static DEFINE_MUTEX(callchain_mutex); @@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void) if (!entries) return -ENOMEM; - size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; + size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS; for_each_possible_cpu(cpu) { entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, @@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) cpu = smp_processor_id(); - return &entries->cpu_entries[cpu][*rctx]; + return (((void *)entries->cpu_entries[cpu]) + + (*rctx * perf_callchain_entry__sizeof())); } static void @@ -215,3 +224,25 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, return entry; } + +int perf_event_max_stack_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int new_value = sysctl_perf_event_max_stack, ret; + struct ctl_table new_table = *table; + + new_table.data = &new_value; + ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + mutex_lock(&callchain_mutex); + if (atomic_read(&nr_callchain_events)) + ret = -EBUSY; + else + sysctl_perf_event_max_stack = new_value; + + mutex_unlock(&callchain_mutex); + + return ret; +} diff --git a/kernel/sysctl.c b/kernel/sysctl.c 
index 725587f10667e..c8b318663525d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -130,6 +130,9 @@ static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif +#ifdef CONFIG_PERF_EVENTS +static int six_hundred_forty_kb = 640 * 1024; +#endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -1144,6 +1147,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one_hundred, }, + { + .procname = "perf_event_max_stack", + .data = NULL, /* filled in by handler */ + .maxlen = sizeof(sysctl_perf_event_max_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = &zero, + .extra2 = &six_hundred_forty_kb, + }, #endif #ifdef CONFIG_KMEMCHECK { -- GitLab From 4cb93446c587d56e2a54f4f83113daba2c0b6dee Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 27 Apr 2016 10:16:24 -0300 Subject: [PATCH 366/705] perf tools: Set the maximum allowed stack from /proc/sys/kernel/perf_event_max_stack There is an upper limit to what tooling considers a valid callchain, and it was tied to the hardcoded value in the kernel, PERF_MAX_STACK_DEPTH (127), now that this can be tuned via a sysctl, make it read it and use that as the upper limit, falling back to PERF_MAX_STACK_DEPTH for kernels where this sysctl isn't present. Cc: Adrian Hunter Cc: Brendan Gregg Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-yjqsd30nnkogvj5oyx9ghir9@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 2 +- tools/perf/Documentation/perf-script.txt | 2 +- tools/perf/Documentation/perf-top.txt | 2 +- tools/perf/Documentation/perf-trace.txt | 2 +- tools/perf/builtin-report.c | 4 ++-- tools/perf/builtin-script.c | 4 +++- tools/perf/builtin-top.c | 4 ++-- tools/perf/builtin-trace.c | 4 ++-- tools/perf/perf.c | 5 +++++ tools/perf/tests/hists_cumulate.c | 2 +- tools/perf/tests/hists_filter.c | 2 +- tools/perf/tests/hists_output.c | 2 +- tools/perf/util/machine.c | 6 +++--- tools/perf/util/scripting-engines/trace-event-perl.c | 2 +- tools/perf/util/util.c | 2 ++ tools/perf/util/util.h | 1 + 16 files changed, 28 insertions(+), 18 deletions(-) diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 496d42cdf02b1..ebaf849e30efd 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -248,7 +248,7 @@ OPTIONS Note that when using the --itrace option the synthesized callchain size will override this value if the synthesized callchain size is bigger. - Default: 127 + Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise. -G:: --inverted:: diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 4fc44c75263fd..a856a1095893c 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -267,7 +267,7 @@ include::itrace.txt[] Note that when using the --itrace option the synthesized callchain size will override this value if the synthesized callchain size is bigger. - Default: 127 + Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise. --ns:: Use 9 decimal places when displaying time (i.e. 
show the nanoseconds) diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index 19f046f027cd8..91d638df3a6bb 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -177,7 +177,7 @@ Default is to monitor all CPUS. between information loss and faster processing especially for workloads that can have a very long callchain stack. - Default: 127 + Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise. --ignore-callees=:: Ignore callees of the function(s) matching the given regex. diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index c075c002eaa40..6afe20121bc06 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -143,7 +143,7 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs. Implies '--call-graph dwarf' when --call-graph not present on the command line, on systems where DWARF unwinding was built in. - Default: 127 + Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise. --min-stack:: Set the stack depth limit when parsing the callchain, anything diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 1d5be0bd426f7..8d9b88af901dd 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -691,7 +691,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) .ordered_events = true, .ordering_requires_timestamps = true, }, - .max_stack = PERF_MAX_STACK_DEPTH, + .max_stack = sysctl_perf_event_max_stack, .pretty_printing_style = "normal", .socket_filter = -1, }; @@ -744,7 +744,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) OPT_INTEGER(0, "max-stack", &report.max_stack, "Set the maximum stack depth when parsing the callchain, " "anything beyond the specified depth will be ignored. " - "Default: " __stringify(PERF_MAX_STACK_DEPTH)), + "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)), OPT_BOOLEAN('G', "inverted", &report.inverted_callchain, "alias for inverted call graph"), OPT_CALLBACK(0, "ignore-callees", NULL, "regex", diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index f43b0c6f88f45..efca81679bb31 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -2031,7 +2031,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) OPT_UINTEGER(0, "max-stack", &scripting_max_stack, "Set the maximum stack depth when parsing the callchain, " "anything beyond the specified depth will be ignored. 
" - "Default: " __stringify(PERF_MAX_STACK_DEPTH)), + "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)), OPT_BOOLEAN('I', "show-info", &show_full_info, "display extended information from perf.data file"), OPT_BOOLEAN('\0', "show-kernel-path", &symbol_conf.show_kernel_path, @@ -2067,6 +2067,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) NULL }; + scripting_max_stack = sysctl_perf_event_max_stack; + setup_scripting(); argc = parse_options_subcommand(argc, argv, options, script_subcommands, script_usage, diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index c130a11d3a0d7..da18517b1d400 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1103,7 +1103,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) }, .proc_map_timeout = 500, }, - .max_stack = PERF_MAX_STACK_DEPTH, + .max_stack = sysctl_perf_event_max_stack, .sym_pcnt_filter = 5, }; struct record_opts *opts = &top.record_opts; @@ -1171,7 +1171,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) "Accumulate callchains of children and show total overhead as well"), OPT_INTEGER(0, "max-stack", &top.max_stack, "Set the maximum stack depth when parsing the callchain. " - "Default: " __stringify(PERF_MAX_STACK_DEPTH)), + "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)), OPT_CALLBACK(0, "ignore-callees", NULL, "regex", "ignore callees of these functions in call graphs", report_parse_ignore_callees_opt), diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 48b00f0425991..f4f3389c92c7b 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3106,7 +3106,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) OPT_UINTEGER(0, "max-stack", &trace.max_stack, "Set the maximum stack depth when parsing the callchain, " "anything beyond the specified depth will be ignored. 
" - "Default: " __stringify(PERF_MAX_STACK_DEPTH)), + "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)), OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout, "per thread proc mmap processing timeout in ms"), OPT_END() @@ -3150,7 +3150,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) mmap_pages_user_set = false; if (trace.max_stack == UINT_MAX) { - trace.max_stack = PERF_MAX_STACK_DEPTH; + trace.max_stack = sysctl_perf_event_max_stack; max_stack_user_set = false; } diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 7b2df2b46525f..83ffe7cd73301 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -17,6 +17,7 @@ #include #include "util/bpf-loader.h" #include "util/debug.h" +#include #include #include #include @@ -533,6 +534,7 @@ int main(int argc, const char **argv) { const char *cmd; char sbuf[STRERR_BUFSIZE]; + int value; /* libsubcmd init */ exec_cmd_init("perf", PREFIX, PERF_EXEC_PATH, EXEC_PATH_ENVIRONMENT); @@ -542,6 +544,9 @@ int main(int argc, const char **argv) page_size = sysconf(_SC_PAGE_SIZE); cacheline_size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + if (sysctl__read_int("kernel/perf_event_max_stack", &value) == 0) + sysctl_perf_event_max_stack = value; + cmd = extract_argv0_path(argv[0]); if (!cmd) cmd = "perf-help"; diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c index ed5aa9eaeb6cf..4a2bbff9b1ee6 100644 --- a/tools/perf/tests/hists_cumulate.c +++ b/tools/perf/tests/hists_cumulate.c @@ -101,7 +101,7 @@ static int add_hist_entries(struct hists *hists, struct machine *machine) if (machine__resolve(machine, &al, &sample) < 0) goto out; - if (hist_entry_iter__add(&iter, &al, PERF_MAX_STACK_DEPTH, + if (hist_entry_iter__add(&iter, &al, sysctl_perf_event_max_stack, NULL) < 0) { addr_location__put(&al); goto out; diff --git a/tools/perf/tests/hists_filter.c b/tools/perf/tests/hists_filter.c index b825d24f81866..e846f8c420136 100644 --- a/tools/perf/tests/hists_filter.c +++ b/tools/perf/tests/hists_filter.c @@ -81,7 +81,7 @@ static int add_hist_entries(struct perf_evlist *evlist, al.socket = fake_samples[i].socket; if (hist_entry_iter__add(&iter, &al, - PERF_MAX_STACK_DEPTH, NULL) < 0) { + sysctl_perf_event_max_stack, NULL) < 0) { addr_location__put(&al); goto out; } diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c index d3556fbe8c5ca..7cd8738e842f0 100644 --- a/tools/perf/tests/hists_output.c +++ b/tools/perf/tests/hists_output.c @@ -67,7 +67,7 @@ static int add_hist_entries(struct hists *hists, struct machine *machine) if (machine__resolve(machine, &al, &sample) < 0) goto out; - if (hist_entry_iter__add(&iter, &al, PERF_MAX_STACK_DEPTH, + if (hist_entry_iter__add(&iter, &al, sysctl_perf_event_max_stack, NULL) < 0) { addr_location__put(&al); goto out; diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 656c1d7ee7d46..2cb95bbf9ea67 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1764,7 +1764,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread, */ int mix_chain_nr = i + 1 + lbr_nr + 1; - if (mix_chain_nr > PERF_MAX_STACK_DEPTH + PERF_MAX_BRANCH_DEPTH) { + if (mix_chain_nr > (int)sysctl_perf_event_max_stack + PERF_MAX_BRANCH_DEPTH) { pr_warning("corrupted callchain. skipping...\n"); return 0; } @@ -1825,7 +1825,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, * Based on DWARF debug information, some architectures skip * a callchain entry saved by the kernel. 
*/ - if (chain->nr < PERF_MAX_STACK_DEPTH) + if (chain->nr < sysctl_perf_event_max_stack) skip_idx = arch_skip_callchain_idx(thread, chain); /* @@ -1886,7 +1886,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, } check_calls: - if (chain->nr > PERF_MAX_STACK_DEPTH && (int)chain->nr > max_stack) { + if (chain->nr > sysctl_perf_event_max_stack && (int)chain->nr > max_stack) { pr_warning("corrupted callchain. skipping...\n"); return 0; } diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index ae1cebc307c5b..62c7f6988e0e5 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -265,7 +265,7 @@ static SV *perl_process_callchain(struct perf_sample *sample, if (thread__resolve_callchain(al->thread, &callchain_cursor, evsel, sample, NULL, NULL, - PERF_MAX_STACK_DEPTH) != 0) { + sysctl_perf_event_max_stack) != 0) { pr_err("Failed to resolve callchain. Skipping\n"); goto exit; } diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 9473d46c00bba..619ba2061b62f 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -33,6 +33,8 @@ struct callchain_param callchain_param = { unsigned int page_size; int cacheline_size; +unsigned int sysctl_perf_event_max_stack = PERF_MAX_STACK_DEPTH; + bool test_attr__enabled; bool perf_host = true; diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 26a924651e7be..88f607af1f470 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -267,6 +267,7 @@ void sighandler_dump_stack(int sig); extern unsigned int page_size; extern int cacheline_size; +extern unsigned int sysctl_perf_event_max_stack; struct parse_tag { char tag; -- GitLab From a2262e5a12e05389ab4c7fc5cf60016b041dd8dc Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 27 Apr 2016 15:59:27 +0200 Subject: [PATCH 367/705] regulator: axp20x: Fix axp22x ldo_io voltage ranges The minium voltage of 1800mV is a copy and paste error from the axp20x regulator info. The correct minimum voltage for the ldo_io regulators on the axp22x is 700mV. 
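To see why the copy-and-paste error matters, recall how the regulator core decodes a linear range: the voltage for a selector is min_uV + selector * uV_step, so an overstated minimum shifts every selector by the same offset. Below is a standalone sketch of that arithmetic (illustrative only, not the driver code; the 700mV/1800mV minimums and 100mV step come from the descriptors changed in this patch):

#include <stdio.h>

/* Decode a linear regulator range the way the core does:
 * reported voltage = min_uV + selector * uV_step. */
static int list_voltage_linear(int min_uV, int uV_step, unsigned int selector)
{
	return min_uV + selector * uV_step;
}

int main(void)
{
	unsigned int sel;

	for (sel = 0; sel <= 3; sel++) {
		/* correct axp22x ldo_io range: 700mV minimum, 100mV steps */
		int real = list_voltage_linear(700000, 100000, sel);
		/* the broken descriptor claimed an 1800mV minimum */
		int claimed = list_voltage_linear(1800000, 100000, sel);

		printf("sel %u: hardware %d uV, driver reported %d uV\n",
		       sel, real, claimed);
	}
	return 0;
}

With the wrong 1800mV minimum, every selector is reported 1100mV above what the hardware actually produces for it.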
Fixes: 1b82b4e4f954 ("regulator: axp20x: Add support for AXP22X regulators") Signed-off-by: Hans de Goede Acked-by: Chen-Yu Tsai Signed-off-by: Mark Brown --- drivers/regulator/axp20x-regulator.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c index 29ab0985b46e1..89f6842956571 100644 --- a/drivers/regulator/axp20x-regulator.c +++ b/drivers/regulator/axp20x-regulator.c @@ -217,10 +217,10 @@ static const struct regulator_desc axp22x_regulators[] = { AXP22X_ELDO2_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(1)), AXP_DESC(AXP22X, ELDO3, "eldo3", "eldoin", 700, 3300, 100, AXP22X_ELDO3_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(2)), - AXP_DESC_IO(AXP22X, LDO_IO0, "ldo_io0", "ips", 1800, 3300, 100, + AXP_DESC_IO(AXP22X, LDO_IO0, "ldo_io0", "ips", 700, 3300, 100, AXP22X_LDO_IO0_V_OUT, 0x1f, AXP20X_GPIO0_CTRL, 0x07, AXP22X_IO_ENABLED, AXP22X_IO_DISABLED), - AXP_DESC_IO(AXP22X, LDO_IO1, "ldo_io1", "ips", 1800, 3300, 100, + AXP_DESC_IO(AXP22X, LDO_IO1, "ldo_io1", "ips", 700, 3300, 100, AXP22X_LDO_IO1_V_OUT, 0x1f, AXP20X_GPIO1_CTRL, 0x07, AXP22X_IO_ENABLED, AXP22X_IO_DISABLED), AXP_DESC_FIXED(AXP22X, RTC_LDO, "rtc_ldo", "ips", 3000), -- GitLab From 3521ba1cc351e80488c3f85748c92c3853b75818 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sun, 17 Apr 2016 15:03:01 -0700 Subject: [PATCH 368/705] powercap, perf/x86/intel/rapl: Add PSys support Skylake processor supports a new set of RAPL registers for controlling entire SoC instead of just CPU package. This is useful for thermal and power control when source of power/thermal is not just CPU/GPU. This change adds a new platform domain (AKA PSys) to the current power capping Intel RAPL driver. PSys also supports PL1 (long term) and PL2 (short term) control like package domain. This also follows same MSRs for energy and time units as package domain. Unlike package domain, PSys support requires more than just processor level implementation. The other parts in the system need additional implementation, which OEMs needs to support. So not all Skylake systems will support PSys. Signed-off-by: Srinivas Pandruvada Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J. 
Wysocki Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: bp@alien8.de Cc: hpa@zytor.com Cc: jacob.jun.pan@linux.intel.com Cc: rjw@rjwysocki.net Link: http://lkml.kernel.org/r/1460930581-29748-3-git-send-email-srinivas.pandruvada@linux.intel.com Signed-off-by: Ingo Molnar --- drivers/powercap/intel_rapl.c | 69 +++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index 8fad0a7044d3d..f2201d42a9cdd 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -34,6 +34,9 @@ #include #include +/* Local defines */ +#define MSR_PLATFORM_POWER_LIMIT 0x0000065C + /* bitmasks for RAPL MSRs, used by primitive access functions */ #define ENERGY_STATUS_MASK 0xffffffff @@ -86,6 +89,7 @@ enum rapl_domain_type { RAPL_DOMAIN_PP0, /* core power plane */ RAPL_DOMAIN_PP1, /* graphics uncore */ RAPL_DOMAIN_DRAM,/* DRAM control_type */ + RAPL_DOMAIN_PLATFORM, /* PSys control_type */ RAPL_DOMAIN_MAX, }; @@ -251,9 +255,11 @@ static const char * const rapl_domain_names[] = { "core", "uncore", "dram", + "psys", }; static struct powercap_control_type *control_type; /* PowerCap Controller */ +static struct rapl_domain *platform_rapl_domain; /* Platform (PSys) domain */ /* caller to ensure CPU hotplug lock is held */ static struct rapl_package *find_package_by_id(int id) @@ -409,6 +415,14 @@ static const struct powercap_zone_ops zone_ops[] = { .set_enable = set_domain_enable, .get_enable = get_domain_enable, }, + /* RAPL_DOMAIN_PLATFORM */ + { + .get_energy_uj = get_energy_counter, + .get_max_energy_range_uj = get_max_energy_counter, + .release = release_zone, + .set_enable = set_domain_enable, + .get_enable = get_domain_enable, + }, }; static int set_power_limit(struct powercap_zone *power_zone, int id, @@ -1160,6 +1174,13 @@ static int rapl_unregister_powercap(void) powercap_unregister_zone(control_type, &rd_package->power_zone); } + + if (platform_rapl_domain) { + powercap_unregister_zone(control_type, + &platform_rapl_domain->power_zone); + kfree(platform_rapl_domain); + } + powercap_unregister_control_type(control_type); return 0; @@ -1239,6 +1260,47 @@ static int rapl_package_register_powercap(struct rapl_package *rp) return ret; } +static int rapl_register_psys(void) +{ + struct rapl_domain *rd; + struct powercap_zone *power_zone; + u64 val; + + if (rdmsrl_safe_on_cpu(0, MSR_PLATFORM_ENERGY_STATUS, &val) || !val) + return -ENODEV; + + if (rdmsrl_safe_on_cpu(0, MSR_PLATFORM_POWER_LIMIT, &val) || !val) + return -ENODEV; + + rd = kzalloc(sizeof(*rd), GFP_KERNEL); + if (!rd) + return -ENOMEM; + + rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM]; + rd->id = RAPL_DOMAIN_PLATFORM; + rd->msrs[0] = MSR_PLATFORM_POWER_LIMIT; + rd->msrs[1] = MSR_PLATFORM_ENERGY_STATUS; + rd->rpl[0].prim_id = PL1_ENABLE; + rd->rpl[0].name = pl1_name; + rd->rpl[1].prim_id = PL2_ENABLE; + rd->rpl[1].name = pl2_name; + rd->rp = find_package_by_id(0); + + power_zone = powercap_register_zone(&rd->power_zone, control_type, + "psys", NULL, + &zone_ops[RAPL_DOMAIN_PLATFORM], + 2, &constraint_ops); + + if (IS_ERR(power_zone)) { + kfree(rd); + return PTR_ERR(power_zone); + } + + platform_rapl_domain = rd; + + return 0; +} + static int rapl_register_powercap(void) { struct rapl_domain *rd; @@ -1255,6 +1317,10 @@ static int rapl_register_powercap(void) list_for_each_entry(rp, &rapl_packages, plist) if (rapl_package_register_powercap(rp)) goto 
err_cleanup_package; + + /* Don't bail out if PSys is not supported */ + rapl_register_psys(); + return ret; err_cleanup_package: @@ -1289,6 +1355,9 @@ static int rapl_check_domain(int cpu, int domain) case RAPL_DOMAIN_DRAM: msr = MSR_DRAM_ENERGY_STATUS; break; + case RAPL_DOMAIN_PLATFORM: + /* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */ + return -EINVAL; default: pr_err("invalid domain id %d\n", domain); return -EINVAL; -- GitLab From 594dd290cf5403a9a5818619dfff42d8e8e0518e Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 22 Apr 2016 17:07:24 +0800 Subject: [PATCH 369/705] sched/cpufreq: Optimize cpufreq update kicker to avoid update multiple times Sometimes delta_exec is 0 due to update_curr() is called multiple times, this is captured by: u64 delta_exec = rq_clock_task(rq) - curr->se.exec_start; This patch optimizes the cpufreq update kicker by bailing out when nothing changed, it will benefit the upcoming schedutil, since otherwise it will (over)react to the special util/max combination. Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1461316044-9520-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 8 ++++---- kernel/sched/rt.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index affd97ec9f65a..8f9b5af4e857e 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -717,10 +717,6 @@ static void update_curr_dl(struct rq *rq) if (!dl_task(curr) || !on_dl_rq(dl_se)) return; - /* Kick cpufreq (see the comment in linux/cpufreq.h). */ - if (cpu_of(rq) == smp_processor_id()) - cpufreq_trigger_update(rq_clock(rq)); - /* * Consumed budget is computed considering the time as * observed by schedulable tasks (excluding time spent @@ -736,6 +732,10 @@ static void update_curr_dl(struct rq *rq) return; } + /* kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c41ea7ac1764b..19e13060fcd50 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -953,14 +953,14 @@ static void update_curr_rt(struct rq *rq) if (curr->sched_class != &rt_sched_class) return; - /* Kick cpufreq (see the comment in linux/cpufreq.h). */ - if (cpu_of(rq) == smp_processor_id()) - cpufreq_trigger_update(rq_clock(rq)); - delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). 
*/ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); -- GitLab From 1d4093d3b3a70b947822cca76d6e4132767ce089 Mon Sep 17 00:00:00 2001 From: Eric Engestrom Date: Mon, 25 Apr 2016 07:36:54 +0100 Subject: [PATCH 370/705] locking/Documentation/lockdep: Fix spelling mistakes Signed-off-by: Eric Engestrom Signed-off-by: Peter Zijlstra (Intel) Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1461566229-4717-2-git-send-email-eric@engestrom.ch Signed-off-by: Ingo Molnar --- Documentation/locking/lockdep-design.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/locking/lockdep-design.txt b/Documentation/locking/lockdep-design.txt index 5001280e9d824..9de1c158d44c7 100644 --- a/Documentation/locking/lockdep-design.txt +++ b/Documentation/locking/lockdep-design.txt @@ -97,7 +97,7 @@ between any two lock-classes: -> -> -The first rule comes from the fact the a hardirq-safe lock could be +The first rule comes from the fact that a hardirq-safe lock could be taken by a hardirq context, interrupting a hardirq-unsafe lock - and thus could result in a lock inversion deadlock. Likewise, a softirq-safe lock could be taken by an softirq context, interrupting a softirq-unsafe @@ -220,7 +220,7 @@ calculated, which hash is unique for every lock chain. The hash value, when the chain is validated for the first time, is then put into a hash table, which hash-table can be checked in a lockfree manner. If the locking chain occurs again later on, the hash table tells us that we -dont have to validate the chain again. +don't have to validate the chain again. Troubleshooting: ---------------- -- GitLab From e7720af5f9ac914577e2b810d5c004cdf395fd82 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Apr 2016 10:22:05 -0700 Subject: [PATCH 371/705] locking/Documentation: Add disclaimer It appears people are reading this document as a requirements list for building hardware. This is not the intent of this document. Nor is it particularly suited for this purpose. The primary purpose of this document is our collective attempt to define a set of primitives that (hopefully) allow us to write correct code on the myriad of SMP platforms Linux supports. Its a definite work in progress as our understanding of these platforms, and memory ordering in general, progresses. Nor does being mentioned in this document mean we think its a particularly good idea; the data dependency barrier required by Alpha being a prime example. Yes we have it, no you're insane to require it when building new hardware. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: corbet@lwn.net Cc: dave@stgolabs.net Cc: dhowells@redhat.com Cc: linux-doc@vger.kernel.org Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1461691328-5429-1-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index a9454b1c73bd4..fb2dd35a823a0 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -4,8 +4,24 @@ By: David Howells Paul E. 
McKenney + Will Deacon + Peter Zijlstra -Contents: +========== +DISCLAIMER +========== + +This document is not a specification; it is intentionally (for the sake of +brevity) and unintentionally (due to being human) incomplete. This document is +meant as a guide to using the various memory barriers provided by Linux, but +in case of any doubt (and there are many) please ask. + +To repeat, this document is not a specification of what Linux expects from +hardware. + +======== +CONTENTS +======== (*) Abstract memory access model. -- GitLab From 8d4840e84871847ee1bae56a776907d08a9265f7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 26 Apr 2016 10:22:06 -0700 Subject: [PATCH 372/705] locking/Documentation: State purpose of memory-barriers.txt There has been some confusion about the purpose of memory-barriers.txt, so this commit adds a statement of purpose. Signed-off-by: David Howells Signed-off-by: Paul E. McKenney Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: corbet@lwn.net Cc: dave@stgolabs.net Cc: linux-doc@vger.kernel.org Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1461691328-5429-2-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index fb2dd35a823a0..8b11e54238bf3 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -19,6 +19,22 @@ in case of any doubt (and there are many) please ask. To repeat, this document is not a specification of what Linux expects from hardware. +The purpose of this document is twofold: + + (1) to specify the minimum functionality that one can rely on for any + particular barrier, and + + (2) to provide a guide as to how to use the barriers that are available. + +Note that an architecture can provide more than the minimum requirement +for any particular barrier, but if the architecure provides less than +that, that architecture is incorrect. + +Note also that it is possible that a barrier may be a no-op for an +architecture because the way that arch works renders an explicit barrier +unnecessary in that case. + + ======== CONTENTS ======== -- GitLab From 3cfe2e8bc1cf74d78df6fe5ca3a1e1805472a004 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 26 Apr 2016 10:22:07 -0700 Subject: [PATCH 373/705] locking/Documentation: Clarify that ACQUIRE applies to loads, RELEASE applies to stores For compound atomics performing both a load and a store operation, make it clear that _acquire and _release variants refer only to the load and store portions of compound atomic. For example, xchg_acquire is an xchg operation where the load takes on ACQUIRE semantics. Signed-off-by: Will Deacon Signed-off-by: Paul E. 
McKenney Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: corbet@lwn.net Cc: dave@stgolabs.net Cc: dhowells@redhat.com Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/1461691328-5429-3-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 8b11e54238bf3..147ae8ec836f8 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -498,6 +498,11 @@ And a couple of implicit varieties: This means that ACQUIRE acts as a minimal "acquire" operation and RELEASE acts as a minimal "release" operation. +A subset of the atomic operations described in atomic_ops.txt have ACQUIRE +and RELEASE variants in addition to fully-ordered and relaxed (no barrier +semantics) definitions. For compound atomics performing both a load and a +store, ACQUIRE semantics apply only to the load and RELEASE semantics apply +only to the store portion of the operation. Memory barriers are only required where there's a possibility of interaction between two CPUs or between a CPU and a device. If it can be guaranteed that -- GitLab From 5db4298133d99b3dfc60d6899ac9df169769c899 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 Apr 2016 10:22:08 -0700 Subject: [PATCH 374/705] lcoking/locktorture: Simplify the torture_runnable computation This commit replaces an #ifdef with IS_ENABLED(), saving five lines. Signed-off-by: Paul E. McKenney Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: corbet@lwn.net Cc: dave@stgolabs.net Cc: dhowells@redhat.com Cc: linux-doc@vger.kernel.org Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1461691328-5429-4-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/locking/locktorture.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index d066a50dc87e6..f8c5af52a131f 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -75,12 +75,7 @@ struct lock_stress_stats { long n_lock_acquired; }; -#if defined(MODULE) -#define LOCKTORTURE_RUNNABLE_INIT 1 -#else -#define LOCKTORTURE_RUNNABLE_INIT 0 -#endif -int torture_runnable = LOCKTORTURE_RUNNABLE_INIT; +int torture_runnable = IS_ENABLED(MODULE); module_param(torture_runnable, int, 0444); MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init"); -- GitLab From 81b785f3e4114ed74fceb48a54e7de2f797a2ba1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 26 Apr 2016 14:46:06 -0700 Subject: [PATCH 375/705] x86/boot: Rename overlapping memcpy() to memmove() Instead of having non-standard memcpy() behavior, explicitly call the new function memmove(), make it available to the decompressors, and switch the two overlap cases (screen scrolling and ELF parsing) to use memmove(). Additionally documents the purpose of compressed/string.c. Suggested-by: Lasse Collin Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: H.J. 
Lu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20160426214606.GA5758@www.outflux.net Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/misc.c | 6 ++++-- arch/x86/boot/compressed/string.c | 19 +++++++++++-------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index c57d785ff9552..6dde6ccdf00eb 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -32,9 +32,11 @@ #undef memcpy #undef memset #define memzero(s, n) memset((s), 0, (n)) +#define memmove memmove /* Functions used by the included decompressor code below. */ static void error(char *m); +void *memmove(void *dest, const void *src, size_t n); /* * This is set up by the setup-routine at boot-time @@ -80,7 +82,7 @@ static void scroll(void) { int i; - memcpy(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2); + memmove(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2); for (i = (lines - 1) * cols * 2; i < lines * cols * 2; i += 2) vidmem[i] = ' '; } @@ -307,7 +309,7 @@ static void parse_elf(void *output) #else dest = (void *)(phdr->p_paddr); #endif - memcpy(dest, output + phdr->p_offset, phdr->p_filesz); + memmove(dest, output + phdr->p_offset, phdr->p_filesz); break; default: /* Ignore other PT_* */ break; } diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 1e10e40f49dd5..2befeca1aada0 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,7 +1,14 @@ +/* + * This provides an optimized implementation of memcpy, and a simplified + * implementation of memset and memmove. These are used here because the + * standard kernel runtime versions are not yet available and we don't + * trust the gcc built-in implementations as they may do unexpected things + * (e.g. FPU ops) in the minimal decompression stub execution environment. + */ #include "../string.c" #ifdef CONFIG_X86_32 -void *__memcpy(void *dest, const void *src, size_t n) +void *memcpy(void *dest, const void *src, size_t n) { int d0, d1, d2; asm volatile( @@ -15,7 +22,7 @@ void *__memcpy(void *dest, const void *src, size_t n) return dest; } #else -void *__memcpy(void *dest, const void *src, size_t n) +void *memcpy(void *dest, const void *src, size_t n) { long d0, d1, d2; asm volatile( @@ -40,17 +47,13 @@ void *memset(void *s, int c, size_t n) return s; } -/* - * This memcpy is overlap safe (i.e. it is memmove without conflicting - * with other definitions of memmove from the various decompressors. - */ -void *memcpy(void *dest, const void *src, size_t n) +void *memmove(void *dest, const void *src, size_t n) { unsigned char *d = dest; const unsigned char *s = src; if (d <= s || d - s >= n) - return __memcpy(dest, src, n); + return memcpy(dest, src, n); while (n-- > 0) d[n] = s[n]; -- GitLab From 88f10e37e150569a390be7a6161fa0f26b7372e9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 26 Apr 2016 09:39:05 -0700 Subject: [PATCH 376/705] sched/core, ARM: Include linux/preempt.h from asm/mmu_context.h arm's mmu_context.h uses preempt_enable_no_resched and but doesn't include anything that would pull in the declaration. If I start including from without this, the build breaks. 
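Put differently, a header should compile no matter which header includes it first; adding the include makes asm/mmu_context.h self-contained. A toy, single-file sketch of the same include-what-you-use principle (hypothetical code, nothing from the kernel tree):

/* This file uses size_t, so it includes the header that defines it
 * itself, rather than relying on every includer having pulled it in
 * first -- the same reason asm/mmu_context.h needs linux/preempt.h
 * for preempt_enable_no_resched(). */
#include <stddef.h>

static inline size_t double_it(size_t x)
{
	return x + x;
}

int main(void)
{
	return double_it(2) == 4 ? 0 : 1;
}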
Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Catalin Marinas Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5b95730a70f2dafe12d4fbf38d20eb7330d67ba3.1461688545.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/arm/include/asm/mmu_context.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h index fa5b42d44985f..ed73babc0dc91 100644 --- a/arch/arm/include/asm/mmu_context.h +++ b/arch/arm/include/asm/mmu_context.h @@ -15,6 +15,7 @@ #include #include +#include #include #include #include -- GitLab From c5b591e96db9d99d0126acf93f24e1fb8b368343 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 25 Apr 2016 21:06:33 +0100 Subject: [PATCH 377/705] efi: Get rid of the EFI_SYSTEM_TABLES status bit The EFI_SYSTEM_TABLES status bit is set by all EFI supporting architectures upon discovery of the EFI system table, but the bit is never tested in any code we have in the tree. So remove it. Signed-off-by: Ard Biesheuvel Signed-off-by: Matt Fleming Cc: Borislav Petkov Cc: Leif Lindholm Cc: Luck, Tony Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1461614832-17633-2-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/ia64/kernel/efi.c | 2 -- arch/x86/platform/efi/efi.c | 2 -- drivers/firmware/efi/arm-runtime.c | 1 - include/linux/efi.h | 1 - 4 files changed, 6 deletions(-) diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 300dac3702f11..bf0865cd438a4 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -531,8 +531,6 @@ efi_init (void) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); - set_bit(EFI_SYSTEM_TABLES, &efi.flags); - palo_phys = EFI_INVALID_TABLE_ADDR; if (efi_config_init(arch_tables) != 0) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 994a7df84a7bc..df393eab0e509 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -352,8 +352,6 @@ static int __init efi_systab_init(void *phys) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff); - set_bit(EFI_SYSTEM_TABLES, &efi.flags); - return 0; } diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 6ae21e41a4294..16c7d2a711566 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -105,7 +105,6 @@ static int __init arm_enable_runtime_services(void) pr_err("Failed to remap EFI System Table\n"); return -ENOMEM; } - set_bit(EFI_SYSTEM_TABLES, &efi.flags); if (!efi_virtmap_init()) { pr_err("No UEFI virtual mapping was installed -- runtime services will not be available\n"); diff --git a/include/linux/efi.h b/include/linux/efi.h index 1626474567ac5..1545098b05653 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1000,7 +1000,6 @@ extern int __init efi_setup_pcdp_console(char *); * possible, remove EFI-related code altogether. */ #define EFI_BOOT 0 /* Were we booted from EFI? */ -#define EFI_SYSTEM_TABLES 1 /* Can we use EFI system tables? */ #define EFI_CONFIG_TABLES 2 /* Can we use EFI config tables? */ #define EFI_RUNTIME_SERVICES 3 /* Can we use runtime services? */ #define EFI_MEMMAP 4 /* Can we use EFI memory map? 
*/ -- GitLab From 14c43be60166981f0b1f034ad9c59252c6f99e0d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 25 Apr 2016 21:06:34 +0100 Subject: [PATCH 378/705] efi/arm*: Drop writable mapping of the UEFI System table Commit: 2eec5dedf770 ("efi/arm-init: Use read-only early mappings") updated the early ARM UEFI init code to create the temporary, early mapping of the UEFI System table using read-only attributes, as a hardening measure against inadvertent modification. However, this still leaves the permanent, writable mapping of the UEFI System table, which is only ever referenced during invocations of UEFI Runtime Services, at which time the UEFI virtual mapping is available, which also covers the system table. (This is guaranteed by the fact that SetVirtualAddressMap(), which is a runtime service itself, converts various entries in the table to their virtual equivalents, which implies that the table must be covered by a RuntimeServicesData region that has the EFI_MEMORY_RUNTIME attribute.) So instead of creating this permanent mapping, record the virtual address of the system table inside the UEFI virtual mapping, and dereference that when accessing the table. This protects the contents of the system table from inadvertent (or deliberate) modification when no UEFI Runtime Services calls are in progress. Signed-off-by: Ard Biesheuvel Signed-off-by: Matt Fleming Cc: Borislav Petkov Cc: Leif Lindholm Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1461614832-17633-3-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- drivers/firmware/efi/arm-init.c | 2 ++ drivers/firmware/efi/arm-runtime.c | 27 ++++++++++++++++----------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index 8714f8c271bab..008ed1993b720 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -85,6 +85,8 @@ static int __init uefi_init(void) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff); + efi.runtime_version = efi.systab->hdr.revision; + /* Show what we know for posterity */ c16 = early_memremap_ro(efi_to_phys(efi.systab->fw_vendor), sizeof(vendor) * sizeof(efi_char16_t)); diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 16c7d2a711566..771750df6b7db 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -42,10 +42,12 @@ static struct mm_struct efi_mm = { static bool __init efi_virtmap_init(void) { efi_memory_desc_t *md; + bool systab_found; efi_mm.pgd = pgd_alloc(&efi_mm); init_new_context(NULL, &efi_mm); + systab_found = false; for_each_efi_memory_desc(&memmap, md) { phys_addr_t phys = md->phys_addr; int ret; @@ -64,8 +66,20 @@ static bool __init efi_virtmap_init(void) &phys, ret); return false; } + /* + * If this entry covers the address of the UEFI system table, + * calculate and record its virtual address. 
+ */ + if (efi_system_table >= phys && + efi_system_table < phys + (md->num_pages * EFI_PAGE_SIZE)) { + efi.systab = (void *)(unsigned long)(efi_system_table - + phys + md->virt_addr); + systab_found = true; + } } - return true; + if (!systab_found) + pr_err("No virtual mapping found for the UEFI System Table\n"); + return systab_found; } /* @@ -99,15 +113,8 @@ static int __init arm_enable_runtime_services(void) memmap.map_end = memmap.map + mapsize; efi.memmap = &memmap; - efi.systab = (__force void *)ioremap_cache(efi_system_table, - sizeof(efi_system_table_t)); - if (!efi.systab) { - pr_err("Failed to remap EFI System Table\n"); - return -ENOMEM; - } - if (!efi_virtmap_init()) { - pr_err("No UEFI virtual mapping was installed -- runtime services will not be available\n"); + pr_err("UEFI virtual mapping missing or invalid -- runtime services will not be available\n"); return -ENOMEM; } @@ -115,8 +122,6 @@ static int __init arm_enable_runtime_services(void) efi_native_runtime_setup(); set_bit(EFI_RUNTIME_SERVICES, &efi.flags); - efi.runtime_version = efi.systab->hdr.revision; - return 0; } early_initcall(arm_enable_runtime_services); -- GitLab From 7fc8442f2a8a77f40565b42c41e4f2d48b179a56 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 25 Apr 2016 21:06:35 +0100 Subject: [PATCH 379/705] x86/mm/pat: Document the (currently) EFI-only code path It's not at all obvious that populate_pgd() and friends are only executed when mapping EFI virtual memory regions or that no other pageattr callers pass a ->pgd value. Reported-by: Andy Lutomirski Signed-off-by: Matt Fleming Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1461614832-17633-4-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 01be9ec3bf792..a1f0e1d0ddc24 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1125,8 +1125,14 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, int primary) { - if (cpa->pgd) + if (cpa->pgd) { + /* + * Right now, we only execute this code path when mapping + * the EFI virtual memory map regions, no other users + * provide a ->pgd value. This may change in the future. + */ return populate_pgd(cpa, vaddr); + } /* * Ignore all non primary paths. -- GitLab From 73a6492589c87cd56707c8ac19eec78236c2d576 Mon Sep 17 00:00:00 2001 From: Linn Crosetto Date: Mon, 25 Apr 2016 21:06:36 +0100 Subject: [PATCH 380/705] efi/arm64: Report unexpected errors when determining Secure Boot status Certain code in the boot path may require the ability to determine whether UEFI Secure Boot is definitely enabled, for example printing status to the console. Other code may need to know when UEFI Secure Boot is definitely disabled, for example restricting use of kernel parameters. If an unexpected error is returned from GetVariable() when querying the status of UEFI Secure Boot, return an error to the caller. This allows the caller to determine the definite state, and to take appropriate action if an expected error is returned. 
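A minimal sketch of how a caller can use such a tri-state result (hypothetical caller code, not the stub itself; the convention -- positive for enabled, zero for disabled, negative errno for unknown -- matches the helper below):

#include <stdio.h>

/* hypothetical tri-state query: 1 = enabled, 0 = disabled,
 * negative errno = status could not be determined */
static int get_secureboot_status(void)
{
	return -5;	/* pretend GetVariable() hit EFI_DEVICE_ERROR */
}

int main(void)
{
	int secure_boot = get_secureboot_status();

	if (secure_boot > 0)
		printf("Secure Boot enabled: restrict parameters\n");
	else if (secure_boot == 0)
		printf("Secure Boot disabled: no restrictions\n");
	else
		/* fail safe: treat "unknown" the same as "enabled" */
		printf("status unknown (%d): restrict parameters\n",
		       secure_boot);
	return 0;
}

Treating the negative case like the enabled case is the conservative choice, which is what the stub does below for the 'dtb=' handling.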
Signed-off-by: Linn Crosetto Signed-off-by: Matt Fleming Reviewed-by: Ard Biesheuvel Acked-by: Mark Rutland Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Roy Franz Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1461614832-17633-5-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- drivers/firmware/efi/libstub/arm-stub.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c index 414deb85c2e52..07f967c4c567f 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/arm-stub.c @@ -20,7 +20,7 @@ bool __nokaslr; -static int efi_secureboot_enabled(efi_system_table_t *sys_table_arg) +static int efi_get_secureboot(efi_system_table_t *sys_table_arg) { static efi_guid_t const var_guid = EFI_GLOBAL_VARIABLE_GUID; static efi_char16_t const var_name[] = { @@ -39,8 +39,12 @@ static int efi_secureboot_enabled(efi_system_table_t *sys_table_arg) return val; case EFI_NOT_FOUND: return 0; + case EFI_DEVICE_ERROR: + return -EIO; + case EFI_SECURITY_VIOLATION: + return -EACCES; default: - return 1; + return -EINVAL; } } @@ -185,6 +189,7 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, efi_guid_t loaded_image_proto = LOADED_IMAGE_PROTOCOL_GUID; unsigned long reserve_addr = 0; unsigned long reserve_size = 0; + int secure_boot = 0; /* Check if we were booted by the EFI firmware */ if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) @@ -250,12 +255,21 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, if (status != EFI_SUCCESS) pr_efi_err(sys_table, "Failed to parse EFI cmdline options\n"); + secure_boot = efi_get_secureboot(sys_table); + if (secure_boot > 0) + pr_efi(sys_table, "UEFI Secure Boot is enabled.\n"); + + if (secure_boot < 0) { + pr_efi_err(sys_table, + "could not determine UEFI Secure Boot status.\n"); + } + /* * Unauthenticated device tree data is a security hazard, so * ignore 'dtb=' unless UEFI Secure Boot is disabled. */ - if (efi_secureboot_enabled(sys_table)) { - pr_efi(sys_table, "UEFI Secure Boot is enabled.\n"); + if (secure_boot != 0 && strstr(cmdline_ptr, "dtb=")) { + pr_efi(sys_table, "Ignoring DTB from command line.\n"); } else { status = handle_cmdline_files(sys_table, image, cmdline_ptr, "dtb=", -- GitLab From 30d7bf034c034995f34dae265d96247f7f12044e Mon Sep 17 00:00:00 2001 From: Linn Crosetto Date: Mon, 25 Apr 2016 21:06:37 +0100 Subject: [PATCH 381/705] efi/arm64: Check SetupMode when determining Secure Boot status According to the UEFI specification (version 2.5 Errata A, page 87): The platform firmware is operating in secure boot mode if the value of the SetupMode variable is 0 and the SecureBoot variable is set to 1. A platform cannot operate in secure boot mode if the SetupMode variable is set to 1. Check the value of the SetupMode variable when determining the state of Secure Boot. Plus also do minor cleanup, change sizeof() use to match kernel style guidelines. 
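The specification rule quoted above reduces to a two-variable predicate; a standalone sketch (illustrative helper, not the stub code):

#include <stdbool.h>
#include <stdio.h>

/* UEFI 2.5: the platform operates in Secure Boot mode only if
 * SecureBoot == 1 and SetupMode == 0. */
static bool uefi_secure_boot_mode(unsigned char secure_boot,
				  unsigned char setup_mode)
{
	return secure_boot == 1 && setup_mode == 0;
}

int main(void)
{
	printf("%d\n", uefi_secure_boot_mode(1, 0));	/* 1: enforcing */
	printf("%d\n", uefi_secure_boot_mode(1, 1));	/* 0: setup mode */
	printf("%d\n", uefi_secure_boot_mode(0, 0));	/* 0: disabled */
	return 0;
}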
Signed-off-by: Linn Crosetto Signed-off-by: Matt Fleming Reviewed-by: Ard Biesheuvel Acked-by: Mark Rutland Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Roy Franz Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1461614832-17633-6-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- drivers/firmware/efi/libstub/arm-stub.c | 32 +++++++++++++++++++------ 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c index 07f967c4c567f..128632508fc64 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/arm-stub.c @@ -22,21 +22,39 @@ bool __nokaslr; static int efi_get_secureboot(efi_system_table_t *sys_table_arg) { - static efi_guid_t const var_guid = EFI_GLOBAL_VARIABLE_GUID; - static efi_char16_t const var_name[] = { + static efi_char16_t const sb_var_name[] = { 'S', 'e', 'c', 'u', 'r', 'e', 'B', 'o', 'o', 't', 0 }; + static efi_char16_t const sm_var_name[] = { + 'S', 'e', 't', 'u', 'p', 'M', 'o', 'd', 'e', 0 }; + efi_guid_t var_guid = EFI_GLOBAL_VARIABLE_GUID; efi_get_variable_t *f_getvar = sys_table_arg->runtime->get_variable; - unsigned long size = sizeof(u8); - efi_status_t status; u8 val; + unsigned long size = sizeof(val); + efi_status_t status; - status = f_getvar((efi_char16_t *)var_name, (efi_guid_t *)&var_guid, + status = f_getvar((efi_char16_t *)sb_var_name, (efi_guid_t *)&var_guid, NULL, &size, &val); + if (status != EFI_SUCCESS) + goto out_efi_err; + + if (val == 0) + return 0; + + status = f_getvar((efi_char16_t *)sm_var_name, (efi_guid_t *)&var_guid, + NULL, &size, &val); + + if (status != EFI_SUCCESS) + goto out_efi_err; + + if (val == 1) + return 0; + + return 1; + +out_efi_err: switch (status) { - case EFI_SUCCESS: - return val; case EFI_NOT_FOUND: return 0; case EFI_DEVICE_ERROR: -- GitLab From 78ce248faa3c46e24e9bd42db3ab3650659f16dd Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 25 Apr 2016 21:06:38 +0100 Subject: [PATCH 382/705] efi: Iterate over efi.memmap in for_each_efi_memory_desc() Most of the users of for_each_efi_memory_desc() are equally happy iterating over the EFI memory map in efi.memmap instead of 'memmap', since the former is usually a pointer to the latter. For those users that want to specify an EFI memory map other than efi.memmap, that can be done using for_each_efi_memory_desc_in_map(). One such example is in the libstub code where the firmware is queried directly for the memory map, it gets iterated over, and then freed. This change goes part of the way toward deleting the global 'memmap' variable, which is not universally available on all architectures (notably IA64) and is rather poorly named. 
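For illustration, here is a standalone mock-up of the iterator pattern this patch converts callers to (simplified stand-in types, not the kernel definitions). The key detail is that the loop steps by the firmware-reported desc_size, not by sizeof(*md), because firmware may lay out descriptors larger than the structure the kernel knows about:

#include <stdio.h>
#include <stddef.h>

/* simplified stand-ins for the EFI types (illustrative only) */
typedef struct {
	unsigned int type;
	unsigned long long phys_addr;
} memdesc_t;

struct memory_map {
	void *map;		/* first descriptor */
	void *map_end;		/* one past the last descriptor */
	size_t desc_size;	/* firmware-defined stride */
};

/* mirrors for_each_efi_memory_desc_in_map(): advance by desc_size */
#define for_each_desc_in_map(m, md)					\
	for ((md) = (m)->map;						\
	     (md) <= (memdesc_t *)((char *)(m)->map_end - (m)->desc_size); \
	     (md) = (void *)((char *)(md) + (m)->desc_size))

int main(void)
{
	memdesc_t descs[3] = { {1, 0x1000}, {2, 0x2000}, {3, 0x3000} };
	struct memory_map m = {
		.map = descs,
		.map_end = descs + 3,
		.desc_size = sizeof(memdesc_t),
	};
	memdesc_t *md;

	for_each_desc_in_map(&m, md)
		printf("type %u at 0x%llx\n", md->type, md->phys_addr);
	return 0;
}

Hiding the stride and the map bounds behind one macro is what lets the open-coded 'for (p = memmap.map; ...)' walks in the diff below collapse into a single for_each_efi_memory_desc(md) line.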
Signed-off-by: Matt Fleming Reviewed-by: Ard Biesheuvel Cc: Borislav Petkov Cc: Leif Lindholm Cc: Mark Salter Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1461614832-17633-7-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi.c | 43 ++++++------------- arch/x86/platform/efi/efi_64.c | 10 ++--- arch/x86/platform/efi/quirks.c | 10 ++--- drivers/firmware/efi/arm-init.c | 4 +- drivers/firmware/efi/arm-runtime.c | 2 +- drivers/firmware/efi/efi.c | 6 +-- drivers/firmware/efi/fake_mem.c | 3 +- .../firmware/efi/libstub/efi-stub-helper.c | 6 ++- include/linux/efi.h | 11 ++++- 9 files changed, 39 insertions(+), 56 deletions(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index df393eab0e509..6f499819a27f9 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -119,11 +119,10 @@ void efi_get_time(struct timespec *now) void __init efi_find_mirror(void) { - void *p; + efi_memory_desc_t *md; u64 mirror_size = 0, total_size = 0; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; @@ -146,10 +145,9 @@ void __init efi_find_mirror(void) static void __init do_add_efi_memmap(void) { - void *p; + efi_memory_desc_t *md; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; int e820_type; @@ -226,17 +224,13 @@ void __init efi_print_memmap(void) { #ifdef EFI_DEBUG efi_memory_desc_t *md; - void *p; - int i; + int i = 0; - for (p = memmap.map, i = 0; - p < memmap.map_end; - p += memmap.desc_size, i++) { + for_each_efi_memory_desc(md) { char buf[64]; - md = p; pr_info("mem%02u: %s range=[0x%016llx-0x%016llx] (%lluMB)\n", - i, efi_md_typeattr_format(buf, sizeof(buf), md), + i++, efi_md_typeattr_format(buf, sizeof(buf), md), md->phys_addr, md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1, (md->num_pages >> (20 - EFI_PAGE_SHIFT))); @@ -550,12 +544,9 @@ void __init efi_set_executable(efi_memory_desc_t *md, bool executable) void __init runtime_code_page_mkexec(void) { efi_memory_desc_t *md; - void *p; /* Make EFI runtime service code area executable */ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - + for_each_efi_memory_desc(md) { if (md->type != EFI_RUNTIME_SERVICES_CODE) continue; @@ -602,12 +593,10 @@ void __init old_map_region(efi_memory_desc_t *md) /* Merge contiguous regions of the same type and attribute */ static void __init efi_merge_regions(void) { - void *p; efi_memory_desc_t *md, *prev_md = NULL; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + for_each_efi_memory_desc(md) { u64 prev_size; - md = p; if (!prev_md) { prev_md = md; @@ -650,15 +639,13 @@ static void __init save_runtime_map(void) { #ifdef CONFIG_KEXEC_CORE efi_memory_desc_t *md; - void *tmp, *p, *q = NULL; + void *tmp, *q = NULL; int count = 0; if (efi_enabled(EFI_OLD_MEMMAP)) return; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - + for_each_efi_memory_desc(md) { if (!(md->attribute & EFI_MEMORY_RUNTIME) || (md->type == EFI_BOOT_SERVICES_CODE) || (md->type == EFI_BOOT_SERVICES_DATA)) @@ -814,7 +801,6 @@ static void __init kexec_enter_virtual_mode(void) #ifdef 
CONFIG_KEXEC_CORE efi_memory_desc_t *md; unsigned int num_pages; - void *p; efi.systab = NULL; @@ -838,8 +824,7 @@ static void __init kexec_enter_virtual_mode(void) * Map efi regions which were passed via setup_data. The virt_addr is a * fixed addr which was used in first kernel of a kexec boot. */ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + for_each_efi_memory_desc(md) { efi_map_region_fixed(md); /* FIXME: add error handling */ get_systab_virt_addr(md); } @@ -1009,13 +994,11 @@ void __init efi_enter_virtual_mode(void) u32 efi_mem_type(unsigned long phys_addr) { efi_memory_desc_t *md; - void *p; if (!efi_enabled(EFI_MEMMAP)) return 0; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + for_each_efi_memory_desc(md) { if ((md->phys_addr <= phys_addr) && (phys_addr < (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)))) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 49e4dd4a1f582..6e7242be1c874 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -55,14 +55,12 @@ struct efi_scratch efi_scratch; static void __init early_code_mapping_set_exec(int executable) { efi_memory_desc_t *md; - void *p; if (!(__supported_pte_mask & _PAGE_NX)) return; /* Make EFI service code area executable */ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + for_each_efi_memory_desc(md) { if (md->type == EFI_RUNTIME_SERVICES_CODE || md->type == EFI_BOOT_SERVICES_CODE) efi_set_executable(md, executable); @@ -253,7 +251,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * Map all of RAM so that we can access arguments in the 1:1 * mapping when making EFI runtime calls. */ - for_each_efi_memory_desc(&memmap, md) { + for_each_efi_memory_desc(md) { if (md->type != EFI_CONVENTIONAL_MEMORY && md->type != EFI_LOADER_DATA && md->type != EFI_LOADER_CODE) @@ -398,7 +396,6 @@ void __init efi_runtime_update_mappings(void) unsigned long pfn; pgd_t *pgd = efi_pgd; efi_memory_desc_t *md; - void *p; if (efi_enabled(EFI_OLD_MEMMAP)) { if (__supported_pte_mask & _PAGE_NX) @@ -409,9 +406,8 @@ void __init efi_runtime_update_mappings(void) if (!efi_enabled(EFI_NX_PE_DATA)) return; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + for_each_efi_memory_desc(md) { unsigned long pf = 0; - md = p; if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index ab50ada1d56e4..097cb09d917b1 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -195,10 +195,9 @@ static bool can_free_region(u64 start, u64 size) */ void __init efi_reserve_boot_services(void) { - void *p; + efi_memory_desc_t *md; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { u64 start = md->phys_addr; u64 size = md->num_pages << EFI_PAGE_SHIFT; bool already_reserved; @@ -250,10 +249,9 @@ void __init efi_reserve_boot_services(void) void __init efi_free_boot_services(void) { - void *p; + efi_memory_desc_t *md; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index 008ed1993b720..d5f6b0ca521e7 100644 --- a/drivers/firmware/efi/arm-init.c +++ 
b/drivers/firmware/efi/arm-init.c @@ -40,7 +40,7 @@ static phys_addr_t efi_to_phys(unsigned long addr) { efi_memory_desc_t *md; - for_each_efi_memory_desc(&memmap, md) { + for_each_efi_memory_desc_in_map(&memmap, md) { if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; if (md->virt_addr == 0) @@ -145,7 +145,7 @@ static __init void reserve_regions(void) if (efi_enabled(EFI_DBG)) pr_info("Processing EFI memory map:\n"); - for_each_efi_memory_desc(&memmap, md) { + for_each_efi_memory_desc_in_map(&memmap, md) { paddr = md->phys_addr; npages = md->num_pages; diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 771750df6b7db..1cfbfaf57a2d9 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -48,7 +48,7 @@ static bool __init efi_virtmap_init(void) init_new_context(NULL, &efi_mm); systab_found = false; - for_each_efi_memory_desc(&memmap, md) { + for_each_efi_memory_desc(md) { phys_addr_t phys = md->phys_addr; int ret; diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 3a69ed5ecfcb5..4b533ce733747 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -620,16 +620,12 @@ char * __init efi_md_typeattr_format(char *buf, size_t size, */ u64 __weak efi_mem_attributes(unsigned long phys_addr) { - struct efi_memory_map *map; efi_memory_desc_t *md; - void *p; if (!efi_enabled(EFI_MEMMAP)) return 0; - map = efi.memmap; - for (p = map->map; p < map->map_end; p += map->desc_size) { - md = p; + for_each_efi_memory_desc(md) { if ((md->phys_addr <= phys_addr) && (phys_addr < (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)))) diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c index ed3a854950cca..f55b75b2e1f4c 100644 --- a/drivers/firmware/efi/fake_mem.c +++ b/drivers/firmware/efi/fake_mem.c @@ -68,8 +68,7 @@ void __init efi_fake_memmap(void) return; /* count up the number of EFI memory descriptor */ - for (old = memmap.map; old < memmap.map_end; old += memmap.desc_size) { - md = old; + for_each_efi_memory_desc(md) { start = md->phys_addr; end = start + (md->num_pages << EFI_PAGE_SHIFT) - 1; diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 29ed2f9b218ca..3bd127f953151 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -125,10 +125,12 @@ unsigned long get_dram_base(efi_system_table_t *sys_table_arg) map.map_end = map.map + map_size; - for_each_efi_memory_desc(&map, md) - if (md->attribute & EFI_MEMORY_WB) + for_each_efi_memory_desc_in_map(&map, md) { + if (md->attribute & EFI_MEMORY_WB) { if (membase > md->phys_addr) membase = md->phys_addr; + } + } efi_call_early(free_pool, map.map); diff --git a/include/linux/efi.h b/include/linux/efi.h index 1545098b05653..17ef4471e6034 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -958,11 +958,20 @@ static inline void efi_fake_memmap(void) { } #endif /* Iterate through an efi_memory_map */ -#define for_each_efi_memory_desc(m, md) \ +#define for_each_efi_memory_desc_in_map(m, md) \ for ((md) = (m)->map; \ (md) <= (efi_memory_desc_t *)((m)->map_end - (m)->desc_size); \ (md) = (void *)(md) + (m)->desc_size) +/** + * for_each_efi_memory_desc - iterate over descriptors in efi.memmap + * @md: the efi_memory_desc_t * iterator + * + * Once the loop finishes @md must not be accessed. 
+ */ +#define for_each_efi_memory_desc(md) \ + for_each_efi_memory_desc_in_map(efi.memmap, md) + /* * Format an EFI memory descriptor's type and attributes to a user-provided * character buffer, as per snprintf(), and return the buffer. -- GitLab From 884f4f66ffd6ffe632f3a8be4e6d10a858afdc37 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 25 Apr 2016 21:06:39 +0100 Subject: [PATCH 383/705] efi: Remove global 'memmap' EFI memory map Abolish the poorly named EFI memory map, 'memmap'. It is shadowed by a bunch of local definitions in various files and having two ways to access the EFI memory map ('efi.memmap' vs. 'memmap') is rather confusing. Furthermore, IA64 doesn't even provide this global object, which has caused issues when trying to write generic EFI memmap code. Replace all occurrences with efi.memmap, and convert the remaining iterator code to use for_each_efi_mem_desc(). Signed-off-by: Matt Fleming Reviewed-by: Ard Biesheuvel Cc: Borislav Petkov Cc: Luck, Tony Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1461614832-17633-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi.c | 84 +++++++++++++++++------------- drivers/firmware/efi/arm-init.c | 20 ++++--- drivers/firmware/efi/arm-runtime.c | 12 ++--- drivers/firmware/efi/efi.c | 2 +- drivers/firmware/efi/fake_mem.c | 40 +++++++------- include/linux/efi.h | 5 +- 6 files changed, 85 insertions(+), 78 deletions(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 6f499819a27f9..88d2fb2cb3ef5 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -56,8 +56,6 @@ #define EFI_DEBUG -struct efi_memory_map memmap; - static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; @@ -207,15 +205,13 @@ int __init efi_memblock_x86_reserve_range(void) #else pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); #endif - memmap.phys_map = pmap; - memmap.nr_map = e->efi_memmap_size / + efi.memmap.phys_map = pmap; + efi.memmap.nr_map = e->efi_memmap_size / e->efi_memdesc_size; - memmap.desc_size = e->efi_memdesc_size; - memmap.desc_version = e->efi_memdesc_version; - - memblock_reserve(pmap, memmap.nr_map * memmap.desc_size); + efi.memmap.desc_size = e->efi_memdesc_size; + efi.memmap.desc_version = e->efi_memdesc_version; - efi.memmap = &memmap; + memblock_reserve(pmap, efi.memmap.nr_map * efi.memmap.desc_size); return 0; } @@ -240,10 +236,14 @@ void __init efi_print_memmap(void) void __init efi_unmap_memmap(void) { + unsigned long size; + clear_bit(EFI_MEMMAP, &efi.flags); - if (memmap.map) { - early_memunmap(memmap.map, memmap.nr_map * memmap.desc_size); - memmap.map = NULL; + + size = efi.memmap.nr_map * efi.memmap.desc_size; + if (efi.memmap.map) { + early_memunmap(efi.memmap.map, size); + efi.memmap.map = NULL; } } @@ -432,17 +432,22 @@ static int __init efi_runtime_init(void) static int __init efi_memmap_init(void) { + unsigned long addr, size; + if (efi_enabled(EFI_PARAVIRT)) return 0; /* Map the EFI memory map */ - memmap.map = early_memremap((unsigned long)memmap.phys_map, - memmap.nr_map * memmap.desc_size); - if (memmap.map == NULL) { + size = efi.memmap.nr_map * efi.memmap.desc_size; + addr = (unsigned long)efi.memmap.phys_map; + + efi.memmap.map = early_memremap(addr, size); + if (efi.memmap.map == NULL) { pr_err("Could not map the memory map!\n"); return -ENOMEM; } - memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); + + efi.memmap.map_end 
= efi.memmap.map + size; if (add_efi_memmap) do_add_efi_memmap(); @@ -638,6 +643,7 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md) static void __init save_runtime_map(void) { #ifdef CONFIG_KEXEC_CORE + unsigned long desc_size; efi_memory_desc_t *md; void *tmp, *q = NULL; int count = 0; @@ -645,21 +651,23 @@ static void __init save_runtime_map(void) if (efi_enabled(EFI_OLD_MEMMAP)) return; + desc_size = efi.memmap.desc_size; + for_each_efi_memory_desc(md) { if (!(md->attribute & EFI_MEMORY_RUNTIME) || (md->type == EFI_BOOT_SERVICES_CODE) || (md->type == EFI_BOOT_SERVICES_DATA)) continue; - tmp = krealloc(q, (count + 1) * memmap.desc_size, GFP_KERNEL); + tmp = krealloc(q, (count + 1) * desc_size, GFP_KERNEL); if (!tmp) goto out; q = tmp; - memcpy(q + count * memmap.desc_size, md, memmap.desc_size); + memcpy(q + count * desc_size, md, desc_size); count++; } - efi_runtime_map_setup(q, count, memmap.desc_size); + efi_runtime_map_setup(q, count, desc_size); return; out: @@ -699,10 +707,10 @@ static inline void *efi_map_next_entry_reverse(void *entry) { /* Initial call */ if (!entry) - return memmap.map_end - memmap.desc_size; + return efi.memmap.map_end - efi.memmap.desc_size; - entry -= memmap.desc_size; - if (entry < memmap.map) + entry -= efi.memmap.desc_size; + if (entry < efi.memmap.map) return NULL; return entry; @@ -744,10 +752,10 @@ static void *efi_map_next_entry(void *entry) /* Initial call */ if (!entry) - return memmap.map; + return efi.memmap.map; - entry += memmap.desc_size; - if (entry >= memmap.map_end) + entry += efi.memmap.desc_size; + if (entry >= efi.memmap.map_end) return NULL; return entry; @@ -761,8 +769,11 @@ static void * __init efi_map_regions(int *count, int *pg_shift) { void *p, *new_memmap = NULL; unsigned long left = 0; + unsigned long desc_size; efi_memory_desc_t *md; + desc_size = efi.memmap.desc_size; + p = NULL; while ((p = efi_map_next_entry(p))) { md = p; @@ -777,7 +788,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift) efi_map_region(md); get_systab_virt_addr(md); - if (left < memmap.desc_size) { + if (left < desc_size) { new_memmap = realloc_pages(new_memmap, *pg_shift); if (!new_memmap) return NULL; @@ -786,10 +797,9 @@ static void * __init efi_map_regions(int *count, int *pg_shift) (*pg_shift)++; } - memcpy(new_memmap + (*count * memmap.desc_size), md, - memmap.desc_size); + memcpy(new_memmap + (*count * desc_size), md, desc_size); - left -= memmap.desc_size; + left -= desc_size; (*count)++; } @@ -833,10 +843,10 @@ static void __init kexec_enter_virtual_mode(void) BUG_ON(!efi.systab); - num_pages = ALIGN(memmap.nr_map * memmap.desc_size, PAGE_SIZE); + num_pages = ALIGN(efi.memmap.nr_map * efi.memmap.desc_size, PAGE_SIZE); num_pages >>= PAGE_SHIFT; - if (efi_setup_page_tables(memmap.phys_map, num_pages)) { + if (efi_setup_page_tables(efi.memmap.phys_map, num_pages)) { clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; } @@ -920,16 +930,16 @@ static void __init __efi_enter_virtual_mode(void) if (efi_is_native()) { status = phys_efi_set_virtual_address_map( - memmap.desc_size * count, - memmap.desc_size, - memmap.desc_version, + efi.memmap.desc_size * count, + efi.memmap.desc_size, + efi.memmap.desc_version, (efi_memory_desc_t *)__pa(new_memmap)); } else { status = efi_thunk_set_virtual_address_map( efi_phys.set_virtual_address_map, - memmap.desc_size * count, - memmap.desc_size, - memmap.desc_version, + efi.memmap.desc_size * count, + efi.memmap.desc_size, + efi.memmap.desc_version, (efi_memory_desc_t 
*)__pa(new_memmap)); } diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index d5f6b0ca521e7..90a9b473e45c7 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -20,8 +20,6 @@ #include -struct efi_memory_map memmap; - u64 efi_system_table; static int __init is_normal_ram(efi_memory_desc_t *md) @@ -40,7 +38,7 @@ static phys_addr_t efi_to_phys(unsigned long addr) { efi_memory_desc_t *md; - for_each_efi_memory_desc_in_map(&memmap, md) { + for_each_efi_memory_desc(md) { if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; if (md->virt_addr == 0) @@ -145,7 +143,7 @@ static __init void reserve_regions(void) if (efi_enabled(EFI_DBG)) pr_info("Processing EFI memory map:\n"); - for_each_efi_memory_desc_in_map(&memmap, md) { + for_each_efi_memory_desc(md) { paddr = md->phys_addr; npages = md->num_pages; @@ -186,9 +184,9 @@ void __init efi_init(void) efi_system_table = params.system_table; - memmap.phys_map = params.mmap; - memmap.map = early_memremap_ro(params.mmap, params.mmap_size); - if (memmap.map == NULL) { + efi.memmap.phys_map = params.mmap; + efi.memmap.map = early_memremap_ro(params.mmap, params.mmap_size); + if (efi.memmap.map == NULL) { /* * If we are booting via UEFI, the UEFI memory map is the only * description of memory we have, so there is little point in @@ -196,15 +194,15 @@ void __init efi_init(void) */ panic("Unable to map EFI memory map.\n"); } - memmap.map_end = memmap.map + params.mmap_size; - memmap.desc_size = params.desc_size; - memmap.desc_version = params.desc_ver; + efi.memmap.map_end = efi.memmap.map + params.mmap_size; + efi.memmap.desc_size = params.desc_size; + efi.memmap.desc_version = params.desc_ver; if (uefi_init() < 0) return; reserve_regions(); - early_memunmap(memmap.map, params.mmap_size); + early_memunmap(efi.memmap.map, params.mmap_size); if (IS_ENABLED(CONFIG_ARM)) { /* diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 1cfbfaf57a2d9..55a9ea0410684 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -103,15 +103,15 @@ static int __init arm_enable_runtime_services(void) pr_info("Remapping and enabling EFI services.\n"); - mapsize = memmap.map_end - memmap.map; - memmap.map = (__force void *)ioremap_cache(memmap.phys_map, - mapsize); - if (!memmap.map) { + mapsize = efi.memmap.map_end - efi.memmap.map; + + efi.memmap.map = (__force void *)ioremap_cache(efi.memmap.phys_map, + mapsize); + if (!efi.memmap.map) { pr_err("Failed to remap EFI memory map\n"); return -ENOMEM; } - memmap.map_end = memmap.map + mapsize; - efi.memmap = &memmap; + efi.memmap.map_end = efi.memmap.map + mapsize; if (!efi_virtmap_init()) { pr_err("UEFI virtual mapping missing or invalid -- runtime services will not be available\n"); diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 4b533ce733747..f7d36c6cc1ad9 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -256,7 +256,7 @@ subsys_initcall(efisubsys_init); */ int __init efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md) { - struct efi_memory_map *map = efi.memmap; + struct efi_memory_map *map = &efi.memmap; phys_addr_t p, e; if (!efi_enabled(EFI_MEMMAP)) { diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c index f55b75b2e1f4c..48430aba13c18 100644 --- a/drivers/firmware/efi/fake_mem.c +++ b/drivers/firmware/efi/fake_mem.c @@ -57,7 +57,7 @@ static int __init cmp_fake_mem(const void *x1, const void *x2) void 
__init efi_fake_memmap(void) { u64 start, end, m_start, m_end, m_attr; - int new_nr_map = memmap.nr_map; + int new_nr_map = efi.memmap.nr_map; efi_memory_desc_t *md; phys_addr_t new_memmap_phy; void *new_memmap; @@ -94,25 +94,25 @@ void __init efi_fake_memmap(void) } /* allocate memory for new EFI memmap */ - new_memmap_phy = memblock_alloc(memmap.desc_size * new_nr_map, + new_memmap_phy = memblock_alloc(efi.memmap.desc_size * new_nr_map, PAGE_SIZE); if (!new_memmap_phy) return; /* create new EFI memmap */ new_memmap = early_memremap(new_memmap_phy, - memmap.desc_size * new_nr_map); + efi.memmap.desc_size * new_nr_map); if (!new_memmap) { - memblock_free(new_memmap_phy, memmap.desc_size * new_nr_map); + memblock_free(new_memmap_phy, efi.memmap.desc_size * new_nr_map); return; } - for (old = memmap.map, new = new_memmap; - old < memmap.map_end; - old += memmap.desc_size, new += memmap.desc_size) { + for (old = efi.memmap.map, new = new_memmap; + old < efi.memmap.map_end; + old += efi.memmap.desc_size, new += efi.memmap.desc_size) { /* copy original EFI memory descriptor */ - memcpy(new, old, memmap.desc_size); + memcpy(new, old, efi.memmap.desc_size); md = new; start = md->phys_addr; end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1; @@ -133,8 +133,8 @@ void __init efi_fake_memmap(void) md->num_pages = (m_end - md->phys_addr + 1) >> EFI_PAGE_SHIFT; /* latter part */ - new += memmap.desc_size; - memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->phys_addr = m_end + 1; md->num_pages = (end - md->phys_addr + 1) >> @@ -146,16 +146,16 @@ void __init efi_fake_memmap(void) md->num_pages = (m_start - md->phys_addr) >> EFI_PAGE_SHIFT; /* middle part */ - new += memmap.desc_size; - memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->attribute |= m_attr; md->phys_addr = m_start; md->num_pages = (m_end - m_start + 1) >> EFI_PAGE_SHIFT; /* last part */ - new += memmap.desc_size; - memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->phys_addr = m_end + 1; md->num_pages = (end - m_end) >> @@ -168,8 +168,8 @@ void __init efi_fake_memmap(void) md->num_pages = (m_start - md->phys_addr) >> EFI_PAGE_SHIFT; /* latter part */ - new += memmap.desc_size; - memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->phys_addr = m_start; md->num_pages = (end - md->phys_addr + 1) >> @@ -181,10 +181,10 @@ void __init efi_fake_memmap(void) /* swap into new EFI memmap */ efi_unmap_memmap(); - memmap.map = new_memmap; - memmap.phys_map = new_memmap_phy; - memmap.nr_map = new_nr_map; - memmap.map_end = memmap.map + memmap.nr_map * memmap.desc_size; + efi.memmap.map = new_memmap; + efi.memmap.phys_map = new_memmap_phy; + efi.memmap.nr_map = new_nr_map; + efi.memmap.map_end = efi.memmap.map + efi.memmap.nr_map * efi.memmap.desc_size; set_bit(EFI_MEMMAP, &efi.flags); /* print new EFI memmap */ diff --git a/include/linux/efi.h b/include/linux/efi.h index 17ef4471e6034..c2c0da49876e0 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -883,7 +883,7 @@ extern struct efi { efi_get_next_high_mono_count_t *get_next_high_mono_count; efi_reset_system_t *reset_system; efi_set_virtual_address_map_t *set_virtual_address_map; - struct efi_memory_map *memmap; + struct efi_memory_map memmap; unsigned long flags; } efi; @@ -945,7 +945,6 @@ extern 
void efi_initialize_iomem_resources(struct resource *code_resource, extern void efi_get_time(struct timespec *now); extern void efi_reserve_boot_services(void); extern int efi_get_fdt_params(struct efi_fdt_params *params); -extern struct efi_memory_map memmap; extern struct kobject *efi_kobj; extern int efi_reboot_quirk_mode; @@ -970,7 +969,7 @@ static inline void efi_fake_memmap(void) { } * Once the loop finishes @md must not be accessed. */ #define for_each_efi_memory_desc(md) \ - for_each_efi_memory_desc_in_map(efi.memmap, md) + for_each_efi_memory_desc_in_map(&efi.memmap, md) /* * Format an EFI memory descriptor's type and attributes to a user-provided -- GitLab From 0d054ad96e97dcd8966e9333eabcc7a466672f70 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 25 Apr 2016 21:06:40 +0100 Subject: [PATCH 384/705] efi: Check EFI_MEMORY_DESCRIPTOR version explicitly Our efi_memory_desc_t type is based on EFI_MEMORY_DESCRIPTOR version 1 in the UEFI spec. No version updates are expected, but since we are about to introduce support for new firmware tables that use the same descriptor type, it makes sense to at least warn if we encounter other versions. Signed-off-by: Ard Biesheuvel Signed-off-by: Matt Fleming Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1461614832-17633-9-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi.c | 4 ++++ drivers/firmware/efi/arm-init.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 88d2fb2cb3ef5..dde46cd78b8f3 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -211,6 +211,10 @@ int __init efi_memblock_x86_reserve_range(void) efi.memmap.desc_size = e->efi_memdesc_size; efi.memmap.desc_version = e->efi_memdesc_version; + WARN(efi.memmap.desc_version != 1, + "Unexpected EFI_MEMORY_DESCRIPTOR version %ld", + efi.memmap.desc_version); + memblock_reserve(pmap, efi.memmap.nr_map * efi.memmap.desc_size); return 0; diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index 90a9b473e45c7..a84dddb54c149 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -198,6 +198,10 @@ void __init efi_init(void) efi.memmap.desc_size = params.desc_size; efi.memmap.desc_version = params.desc_ver; + WARN(efi.memmap.desc_version != 1, + "Unexpected EFI_MEMORY_DESCRIPTOR version %ld", + efi.memmap.desc_version); + if (uefi_init() < 0) return; -- GitLab From 24d45d1dc275b818093fe1d0055a230ce5e8c4c7 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 25 Apr 2016 21:06:41 +0100 Subject: [PATCH 385/705] efi/arm*: Use memremap() to create the persistent memmap mapping Instead of using ioremap_cache(), which is slightly inappropriate for mapping firmware tables, and is not even allowed on ARM for mapping regions that are covered by a struct page, use memremap(), which was invented for this purpose, and will also reuse the existing kernel direct mapping if the requested region is covered by it. 
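As an illustration of the pattern this patch adopts, here is a minimal kernel-style sketch (not part of the patch; the helper name is hypothetical):

#include <linux/io.h>
#include <linux/printk.h>

/*
 * Hypothetical helper: map a firmware table for ordinary cacheable
 * access. memremap() with MEMREMAP_WB reuses the kernel direct
 * mapping when the physical range is already covered by it, rather
 * than creating a fresh ioremap()-style mapping.
 */
static void *map_firmware_table(phys_addr_t phys, size_t size)
{
	void *virt = memremap(phys, size, MEMREMAP_WB);

	if (!virt)
		pr_err("failed to remap firmware table at %pa\n", &phys);

	return virt;	/* release with memunmap() when done */
}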
Signed-off-by: Ard Biesheuvel Signed-off-by: Matt Fleming Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1461614832-17633-10-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- drivers/firmware/efi/arm-runtime.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 55a9ea0410684..19283deac375d 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -105,8 +105,7 @@ static int __init arm_enable_runtime_services(void) mapsize = efi.memmap.map_end - efi.memmap.map; - efi.memmap.map = (__force void *)ioremap_cache(efi.memmap.phys_map, - mapsize); + efi.memmap.map = memremap(efi.memmap.phys_map, mapsize, MEMREMAP_WB); if (!efi.memmap.map) { pr_err("Failed to remap EFI memory map\n"); return -ENOMEM; -- GitLab From 9fc68b717c24a215a32c1b4e05b30433cafb2599 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 25 Apr 2016 21:06:42 +0100 Subject: [PATCH 386/705] ARM/efi: Apply strict permissions for UEFI Runtime Services regions Recent UEFI versions expose permission attributes for runtime services memory regions, either in the UEFI memory map or in the separate memory attributes table. This allows the kernel to map these regions with stricter permissions, rather than the RWX permissions that are used by default. So wire this up in our mapping routine. Signed-off-by: Ard Biesheuvel Signed-off-by: Matt Fleming Cc: Borislav Petkov Cc: Catalin Marinas Cc: Leif Lindholm Cc: Mark Rutland Cc: Peter Jones Cc: Peter Zijlstra Cc: Russell King Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1461614832-17633-11-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/arm/include/asm/efi.h | 1 + arch/arm/kernel/efi.c | 41 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index e0eea72deb87e..b0c341d7ceee0 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -22,6 +22,7 @@ void efi_init(void); int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); +int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); #define efi_call_virt(f, ...) \ ({ \ diff --git a/arch/arm/kernel/efi.c b/arch/arm/kernel/efi.c index ff8a9d8acfaca..9f43ba012d107 100644 --- a/arch/arm/kernel/efi.c +++ b/arch/arm/kernel/efi.c @@ -11,6 +11,41 @@ #include #include +static int __init set_permissions(pte_t *ptep, pgtable_t token, + unsigned long addr, void *data) +{ + efi_memory_desc_t *md = data; + pte_t pte = *ptep; + + if (md->attribute & EFI_MEMORY_RO) + pte = set_pte_bit(pte, __pgprot(L_PTE_RDONLY)); + if (md->attribute & EFI_MEMORY_XP) + pte = set_pte_bit(pte, __pgprot(L_PTE_XN)); + set_pte_ext(ptep, pte, PTE_EXT_NG); + return 0; +} + +int __init efi_set_mapping_permissions(struct mm_struct *mm, + efi_memory_desc_t *md) +{ + unsigned long base, size; + + base = md->virt_addr; + size = md->num_pages << EFI_PAGE_SHIFT; + + /* + * We can only use apply_to_page_range() if we can guarantee that the + * entire region was mapped using pages. This should be the case if the + * region does not cover any naturally aligned SECTION_SIZE sized + * blocks. 
+ */ + if (round_down(base + size, SECTION_SIZE) < + round_up(base, SECTION_SIZE) + SECTION_SIZE) + return apply_to_page_range(mm, base, size, set_permissions, md); + + return 0; +} + int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) { struct map_desc desc = { @@ -34,5 +69,11 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) desc.type = MT_DEVICE; create_mapping_late(mm, &desc, true); + + /* + * If stricter permissions were specified, apply them now. + */ + if (md->attribute & (EFI_MEMORY_RO | EFI_MEMORY_XP)) + return efi_set_mapping_permissions(mm, md); return 0; } -- GitLab From 1fd55a9a09b0293af95ab4299b108f030fef4464 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 25 Apr 2016 21:06:43 +0100 Subject: [PATCH 387/705] arm64/efi: Apply strict permissions to UEFI Runtime Services regions Recent UEFI versions expose permission attributes for runtime services memory regions, either in the UEFI memory map or in the separate memory attributes table. This allows the kernel to map these regions with stricter permissions, rather than the RWX permissions that are used by default. So wire this up in our mapping routine. Note that in the absence of permission attributes, we still only map regions of type EFI_RUNTIME_SERVICE_CODE with the executable bit set. Also, we base the mapping attributes of EFI_MEMORY_MAPPED_IO on the type directly rather than on the absence of the EFI_MEMORY_WB attribute. This is more correct, but is also required for compatibility with the upcoming support for the Memory Attributes Table, which only carries permission attributes, not memory type attributes. Signed-off-by: Ard Biesheuvel Signed-off-by: Matt Fleming Cc: Borislav Petkov Cc: Catalin Marinas Cc: Leif Lindholm Cc: Mark Rutland Cc: Peter Jones Cc: Peter Zijlstra Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1461614832-17633-12-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/arm64/kernel/efi.c | 54 ++++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index b6abc852f2a14..33a6da160a501 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -17,22 +17,48 @@ #include -int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) +/* + * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be + * executable, everything else can be mapped with the XN bits + * set. Also take the new (optional) RO/XP bits into account. + */ +static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) { - pteval_t prot_val; + u64 attr = md->attribute; + u32 type = md->type; - /* - * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be - * executable, everything else can be mapped with the XN bits - * set. - */ - if ((md->attribute & EFI_MEMORY_WB) == 0) - prot_val = PROT_DEVICE_nGnRE; - else if (md->type == EFI_RUNTIME_SERVICES_CODE || - !PAGE_ALIGNED(md->phys_addr)) - prot_val = pgprot_val(PAGE_KERNEL_EXEC); - else - prot_val = pgprot_val(PAGE_KERNEL); + if (type == EFI_MEMORY_MAPPED_IO) + return PROT_DEVICE_nGnRE; + + if (WARN_ONCE(!PAGE_ALIGNED(md->phys_addr), + "UEFI Runtime regions are not aligned to 64 KB -- buggy firmware?")) + /* + * If the region is not aligned to the page size of the OS, we + * can not use strict permissions, since that would also affect + * the mapping attributes of the adjacent regions. 
+ */ + return pgprot_val(PAGE_KERNEL_EXEC); + + /* R-- */ + if ((attr & (EFI_MEMORY_XP | EFI_MEMORY_RO)) == + (EFI_MEMORY_XP | EFI_MEMORY_RO)) + return pgprot_val(PAGE_KERNEL_RO); + + /* R-X */ + if (attr & EFI_MEMORY_RO) + return pgprot_val(PAGE_KERNEL_ROX); + + /* RW- */ + if (attr & EFI_MEMORY_XP || type != EFI_RUNTIME_SERVICES_CODE) + return pgprot_val(PAGE_KERNEL); + + /* RWX */ + return pgprot_val(PAGE_KERNEL_EXEC); +} + +int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) +{ + pteval_t prot_val = create_mapping_protection(md); create_pgd_mapping(mm, md->phys_addr, md->virt_addr, md->num_pages << EFI_PAGE_SHIFT, -- GitLab From a604af075a3226adaff84b7026876f0c6dfe9f52 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 25 Apr 2016 21:06:44 +0100 Subject: [PATCH 388/705] efi: Add support for the EFI_MEMORY_ATTRIBUTES_TABLE config table This declares the GUID and struct typedef for the new memory attributes table which contains the permissions that can be used to apply stricter permissions to UEFI Runtime Services memory regions. Signed-off-by: Ard Biesheuvel Signed-off-by: Matt Fleming Cc: Borislav Petkov Cc: Catalin Marinas Cc: Leif Lindholm Cc: Mark Rutland Cc: Peter Jones Cc: Peter Zijlstra Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1461614832-17633-13-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- drivers/firmware/efi/efi.c | 2 ++ include/linux/efi.h | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index f7d36c6cc1ad9..583e647912a5d 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -43,6 +43,7 @@ struct efi __read_mostly efi = { .config_table = EFI_INVALID_TABLE_ADDR, .esrt = EFI_INVALID_TABLE_ADDR, .properties_table = EFI_INVALID_TABLE_ADDR, + .mem_attr_table = EFI_INVALID_TABLE_ADDR, }; EXPORT_SYMBOL(efi); @@ -338,6 +339,7 @@ static __initdata efi_config_table_type_t common_tables[] = { {UGA_IO_PROTOCOL_GUID, "UGA", &efi.uga}, {EFI_SYSTEM_RESOURCE_TABLE_GUID, "ESRT", &efi.esrt}, {EFI_PROPERTIES_TABLE_GUID, "PROP", &efi.properties_table}, + {EFI_MEMORY_ATTRIBUTES_TABLE_GUID, "MEMATTR", &efi.mem_attr_table}, {NULL_GUID, NULL, NULL}, }; diff --git a/include/linux/efi.h b/include/linux/efi.h index c2c0da49876e0..81af5feba1f76 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -623,6 +623,10 @@ void efi_native_runtime_setup(void); EFI_GUID(0x3152bca5, 0xeade, 0x433d, \ 0x86, 0x2e, 0xc0, 0x1c, 0xdc, 0x29, 0x1f, 0x44) +#define EFI_MEMORY_ATTRIBUTES_TABLE_GUID \ + EFI_GUID(0xdcfa911d, 0x26eb, 0x469f, \ + 0xa2, 0x20, 0x38, 0xb7, 0xdc, 0x46, 0x12, 0x20) + typedef struct { efi_guid_t guid; u64 table; @@ -847,6 +851,14 @@ typedef struct { #define EFI_INVALID_TABLE_ADDR (~0UL) +typedef struct { + u32 version; + u32 num_entries; + u32 desc_size; + u32 reserved; + efi_memory_desc_t entry[0]; +} efi_memory_attributes_table_t; + /* * All runtime access to EFI goes through this structure: */ @@ -868,6 +880,7 @@ extern struct efi { unsigned long config_table; /* config tables */ unsigned long esrt; /* ESRT table */ unsigned long properties_table; /* properties table */ + unsigned long mem_attr_table; /* memory attributes table */ efi_get_time_t *get_time; efi_set_time_t *set_time; efi_get_wakeup_time_t *get_wakeup_time; -- GitLab From 10f0d2f57705350bbbe5f28e9292ae3905823c3c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 25 Apr 2016 21:06:45 +0100 
Subject: [PATCH 389/705] efi: Implement generic support for the Memory Attributes table This implements shared support for discovering the presence of the Memory Attributes table, and for parsing and validating its contents. The table is validated against the construction rules in the UEFI spec. Since this is a new table, it makes sense to complain if we encounter a table that does not follow those rules. The parsing and validation routine takes a callback that can be specified per architecture, that gets passed each unique validated region, with the virtual address retrieved from the ordinary memory map. Signed-off-by: Ard Biesheuvel [ Trim pr_*() strings to 80 cols and use EFI consistently. ] Signed-off-by: Matt Fleming Cc: Borislav Petkov Cc: Catalin Marinas Cc: Leif Lindholm Cc: Mark Rutland Cc: Peter Jones Cc: Peter Zijlstra Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1461614832-17633-14-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- drivers/firmware/efi/Makefile | 2 +- drivers/firmware/efi/memattr.c | 182 +++++++++++++++++++++++++++++++++ include/linux/efi.h | 13 +++ 3 files changed, 196 insertions(+), 1 deletion(-) create mode 100644 drivers/firmware/efi/memattr.c diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index 62e654f255f4d..d5be623991308 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -9,7 +9,7 @@ # KASAN_SANITIZE_runtime-wrappers.o := n -obj-$(CONFIG_EFI) += efi.o vars.o reboot.o +obj-$(CONFIG_EFI) += efi.o vars.o reboot.o memattr.o obj-$(CONFIG_EFI_VARS) += efivars.o obj-$(CONFIG_EFI_ESRT) += esrt.o obj-$(CONFIG_EFI_VARS_PSTORE) += efi-pstore.o diff --git a/drivers/firmware/efi/memattr.c b/drivers/firmware/efi/memattr.c new file mode 100644 index 0000000000000..236004b9a50d3 --- /dev/null +++ b/drivers/firmware/efi/memattr.c @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2016 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) "efi: memattr: " fmt + +#include +#include +#include +#include + +#include + +static int __initdata tbl_size; + +/* + * Reserve the memory associated with the Memory Attributes configuration + * table, if it exists. + */ +int __init efi_memattr_init(void) +{ + efi_memory_attributes_table_t *tbl; + + if (efi.mem_attr_table == EFI_INVALID_TABLE_ADDR) + return 0; + + tbl = early_memremap(efi.mem_attr_table, sizeof(*tbl)); + if (!tbl) { + pr_err("Failed to map EFI Memory Attributes table @ 0x%lx\n", + efi.mem_attr_table); + return -ENOMEM; + } + + if (tbl->version > 1) { + pr_warn("Unexpected EFI Memory Attributes table version %d\n", + tbl->version); + goto unmap; + } + + tbl_size = sizeof(*tbl) + tbl->num_entries * tbl->desc_size; + memblock_reserve(efi.mem_attr_table, tbl_size); + +unmap: + early_memunmap(tbl, sizeof(*tbl)); + return 0; +} + +/* + * Returns a copy @out of the UEFI memory descriptor @in if it is covered + * entirely by a UEFI memory map entry with matching attributes. The virtual + * address of @out is set according to the matching entry that was found. 
+ */ +static bool entry_is_valid(const efi_memory_desc_t *in, efi_memory_desc_t *out) +{ + u64 in_paddr = in->phys_addr; + u64 in_size = in->num_pages << EFI_PAGE_SHIFT; + efi_memory_desc_t *md; + + *out = *in; + + if (in->type != EFI_RUNTIME_SERVICES_CODE && + in->type != EFI_RUNTIME_SERVICES_DATA) { + pr_warn("Entry type should be RuntimeServiceCode/Data\n"); + return false; + } + + if (!(in->attribute & (EFI_MEMORY_RO | EFI_MEMORY_XP))) { + pr_warn("Entry attributes invalid: RO and XP bits both cleared\n"); + return false; + } + + if (PAGE_SIZE > EFI_PAGE_SIZE && + (!PAGE_ALIGNED(in->phys_addr) || + !PAGE_ALIGNED(in->num_pages << EFI_PAGE_SHIFT))) { + /* + * Since arm64 may execute with page sizes of up to 64 KB, the + * UEFI spec mandates that RuntimeServices memory regions must + * be 64 KB aligned. We need to validate this here since we will + * not be able to tighten permissions on such regions without + * affecting adjacent regions. + */ + pr_warn("Entry address region misaligned\n"); + return false; + } + + for_each_efi_memory_desc(md) { + u64 md_paddr = md->phys_addr; + u64 md_size = md->num_pages << EFI_PAGE_SHIFT; + + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + if (md->virt_addr == 0) { + /* no virtual mapping has been installed by the stub */ + break; + } + + if (md_paddr > in_paddr || (in_paddr - md_paddr) >= md_size) + continue; + + /* + * This entry covers the start of @in, check whether + * it covers the end as well. + */ + if (md_paddr + md_size < in_paddr + in_size) { + pr_warn("Entry covers multiple EFI memory map regions\n"); + return false; + } + + if (md->type != in->type) { + pr_warn("Entry type deviates from EFI memory map region type\n"); + return false; + } + + out->virt_addr = in_paddr + (md->virt_addr - md_paddr); + + return true; + } + + pr_warn("No matching entry found in the EFI memory map\n"); + return false; +} + +/* + * To be called after the EFI page tables have been populated. If a memory + * attributes table is available, its contents will be used to update the + * mappings with tightened permissions as described by the table. + * This requires the UEFI memory map to have already been populated with + * virtual addresses. + */ +int __init efi_memattr_apply_permissions(struct mm_struct *mm, + efi_memattr_perm_setter fn) +{ + efi_memory_attributes_table_t *tbl; + int i, ret; + + if (tbl_size <= sizeof(*tbl)) + return 0; + + /* + * We need the EFI memory map to be setup so we can use it to + * lookup the virtual addresses of all entries in the of EFI + * Memory Attributes table. If it isn't available, this + * function should not be called. + */ + if (WARN_ON(!efi_enabled(EFI_MEMMAP))) + return 0; + + tbl = memremap(efi.mem_attr_table, tbl_size, MEMREMAP_WB); + if (!tbl) { + pr_err("Failed to map EFI Memory Attributes table @ 0x%lx\n", + efi.mem_attr_table); + return -ENOMEM; + } + + if (efi_enabled(EFI_DBG)) + pr_info("Processing EFI Memory Attributes table:\n"); + + for (i = ret = 0; ret == 0 && i < tbl->num_entries; i++) { + efi_memory_desc_t md; + unsigned long size; + bool valid; + char buf[64]; + + valid = entry_is_valid((void *)tbl->entry + i * tbl->desc_size, + &md); + size = md.num_pages << EFI_PAGE_SHIFT; + if (efi_enabled(EFI_DBG) || !valid) + pr_info("%s 0x%012llx-0x%012llx %s\n", + valid ? 
"" : "!", md.phys_addr, + md.phys_addr + size - 1, + efi_md_typeattr_format(buf, sizeof(buf), &md)); + + if (valid) + ret = fn(mm, &md); + } + memunmap(tbl); + return ret; +} diff --git a/include/linux/efi.h b/include/linux/efi.h index 81af5feba1f76..e29a31d0fc353 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -969,6 +969,19 @@ extern void __init efi_fake_memmap(void); static inline void efi_fake_memmap(void) { } #endif +/* + * efi_memattr_perm_setter - arch specific callback function passed into + * efi_memattr_apply_permissions() that updates the + * mapping permissions described by the second + * argument in the page tables referred to by the + * first argument. + */ +typedef int (*efi_memattr_perm_setter)(struct mm_struct *, efi_memory_desc_t *); + +extern int efi_memattr_init(void); +extern int efi_memattr_apply_permissions(struct mm_struct *mm, + efi_memattr_perm_setter fn); + /* Iterate through an efi_memory_map */ #define for_each_efi_memory_desc_in_map(m, md) \ for ((md) = (m)->map; \ -- GitLab From 789957ef72f976cb325e9057225fc4e9c4513060 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 25 Apr 2016 21:06:46 +0100 Subject: [PATCH 390/705] efi/arm*: Take the Memory Attributes table into account Call into the generic memory attributes table support code at the appropriate times during the init sequence so that the UEFI Runtime Services region are mapped according to the strict permissions it specifies. Signed-off-by: Ard Biesheuvel Signed-off-by: Matt Fleming Cc: Borislav Petkov Cc: Catalin Marinas Cc: Leif Lindholm Cc: Mark Rutland Cc: Peter Jones Cc: Peter Zijlstra Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1461614832-17633-15-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/arm64/include/asm/efi.h | 2 ++ drivers/firmware/efi/arm-init.c | 1 + drivers/firmware/efi/arm-runtime.c | 10 ++++++++-- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 8e88a696c9cbc..4dafc89f373a0 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -14,6 +14,8 @@ extern void efi_init(void); int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); +#define efi_set_mapping_permissions efi_create_mapping + #define efi_call_virt(f, ...) 
\ ({ \ efi_##f##_t *__f; \ diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index a84dddb54c149..909d974d35d9d 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -206,6 +206,7 @@ void __init efi_init(void) return; reserve_regions(); + efi_memattr_init(); early_memunmap(efi.memmap.map, params.mmap_size); if (IS_ENABLED(CONFIG_ARM)) { diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 19283deac375d..17ccf0a8787a2 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -77,9 +77,15 @@ static bool __init efi_virtmap_init(void) systab_found = true; } } - if (!systab_found) + if (!systab_found) { pr_err("No virtual mapping found for the UEFI System Table\n"); - return systab_found; + return false; + } + + if (efi_memattr_apply_permissions(&efi_mm, efi_set_mapping_permissions)) + return false; + + return true; } /* -- GitLab From c3c1c47f15b37a8492e630d1e9ab8ad576ee10e5 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 25 Apr 2016 21:06:47 +0100 Subject: [PATCH 391/705] x86/efi: Remove the always true EFI_DEBUG symbol This symbol is always set which makes it useless. Additionally we have a kernel command-line switch, efi=debug, which actually controls the printing of the memory map. Reported-by: Robert Elliott Signed-off-by: Matt Fleming Acked-by: Borislav Petkov Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1461614832-17633-16-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index dde46cd78b8f3..f93545e7dc54e 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -54,8 +54,6 @@ #include #include -#define EFI_DEBUG - static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; @@ -222,7 +220,6 @@ int __init efi_memblock_x86_reserve_range(void) void __init efi_print_memmap(void) { -#ifdef EFI_DEBUG efi_memory_desc_t *md; int i = 0; @@ -235,7 +232,6 @@ void __init efi_print_memmap(void) md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1, (md->num_pages >> (20 - EFI_PAGE_SHIFT))); } -#endif /* EFI_DEBUG */ } void __init efi_unmap_memmap(void) -- GitLab From 2c23b73c2d0249c499c4784b6db08dcfc6b7b3b0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 25 Apr 2016 21:06:48 +0100 Subject: [PATCH 392/705] x86/efi: Prepare GOP handling code for reuse as generic code In preparation of moving this code to drivers/firmware/efi and reusing it on ARM and arm64, apply any changes that will be required to make this code build for other architectures. This should make it easier to track down problems that this move may cause to its operation on x86. Note that the generic version uses slightly different ways of casting the protocol methods and some other variables to the correct types, since such method calls are not loosely typed on ARM and arm64 as they are on x86. 
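To make the typing difference concrete, a hedged sketch follows (illustrative only; the prototype below is simplified and is not the full GOP query_mode signature):

typedef unsigned long efi_status_t;

/* simplified stand-in for the protocol method's real prototype */
typedef efi_status_t (*query_mode_fn)(void *gop, unsigned int mode_nr);

static efi_status_t call_query_mode(unsigned long method, void *gop,
				    unsigned int mode_nr)
{
	/*
	 * On ARM and arm64 the stub issues a normal C call, so the
	 * integer method value read out of the 32/64-bit protocol
	 * struct must first be cast back to a properly typed function
	 * pointer; on x86 the loosely typed efi_early->call() thunk
	 * hides this step.
	 */
	query_mode_fn query_mode = (query_mode_fn)method;

	return query_mode(gop, mode_nr);
}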
Signed-off-by: Ard Biesheuvel Signed-off-by: Matt Fleming Cc: Borislav Petkov Cc: David Herrmann Cc: Mark Rutland Cc: Peter Jones Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1461614832-17633-17-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/eboot.c | 58 +++++++++++++++++++------------- arch/x86/boot/compressed/eboot.h | 4 +++ arch/x86/include/asm/efi.h | 5 +++ include/linux/efi.h | 5 +++ 4 files changed, 49 insertions(+), 23 deletions(-) diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 583d539a41977..10516e22fdcb0 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -622,19 +622,22 @@ setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line, } static efi_status_t -__gop_query32(struct efi_graphics_output_protocol_32 *gop32, +__gop_query32(efi_system_table_t *sys_table_arg, + struct efi_graphics_output_protocol_32 *gop32, struct efi_graphics_output_mode_info **info, unsigned long *size, u64 *fb_base) { struct efi_graphics_output_protocol_mode_32 *mode; + efi_graphics_output_protocol_query_mode query_mode; efi_status_t status; unsigned long m; m = gop32->mode; mode = (struct efi_graphics_output_protocol_mode_32 *)m; + query_mode = (void *)(unsigned long)gop32->query_mode; - status = efi_early->call(gop32->query_mode, gop32, - mode->mode, size, info); + status = __efi_call_early(query_mode, (void *)gop32, mode->mode, size, + info); if (status != EFI_SUCCESS) return status; @@ -643,8 +646,8 @@ __gop_query32(struct efi_graphics_output_protocol_32 *gop32, } static efi_status_t -setup_gop32(struct screen_info *si, efi_guid_t *proto, - unsigned long size, void **gop_handle) +setup_gop32(efi_system_table_t *sys_table_arg, struct screen_info *si, + efi_guid_t *proto, unsigned long size, void **gop_handle) { struct efi_graphics_output_protocol_32 *gop32, *first_gop; unsigned long nr_gops; @@ -654,7 +657,7 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, u64 fb_base; struct efi_pixel_bitmask pixel_info; int pixel_format; - efi_status_t status; + efi_status_t status = EFI_NOT_FOUND; u32 *handles = (u32 *)(unsigned long)gop_handle; int i; @@ -667,7 +670,7 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; bool conout_found = false; void *dummy = NULL; - u32 h = handles[i]; + efi_handle_t h = (efi_handle_t)(unsigned long)handles[i]; u64 current_fb_base; status = efi_call_early(handle_protocol, h, @@ -680,7 +683,8 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, if (status == EFI_SUCCESS) conout_found = true; - status = __gop_query32(gop32, &info, &size, ¤t_fb_base); + status = __gop_query32(sys_table_arg, gop32, &info, &size, + ¤t_fb_base); if (status == EFI_SUCCESS && (!first_gop || conout_found)) { /* * Systems that use the UEFI Console Splitter may @@ -735,19 +739,22 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, } static efi_status_t -__gop_query64(struct efi_graphics_output_protocol_64 *gop64, +__gop_query64(efi_system_table_t *sys_table_arg, + struct efi_graphics_output_protocol_64 *gop64, struct efi_graphics_output_mode_info **info, unsigned long *size, u64 *fb_base) { struct efi_graphics_output_protocol_mode_64 *mode; + efi_graphics_output_protocol_query_mode query_mode; efi_status_t status; unsigned long m; m = gop64->mode; mode = (struct efi_graphics_output_protocol_mode_64 *)m; + query_mode = (void 
*)(unsigned long)gop64->query_mode; - status = efi_early->call(gop64->query_mode, gop64, - mode->mode, size, info); + status = __efi_call_early(query_mode, (void *)gop64, mode->mode, size, + info); if (status != EFI_SUCCESS) return status; @@ -756,8 +763,8 @@ __gop_query64(struct efi_graphics_output_protocol_64 *gop64, } static efi_status_t -setup_gop64(struct screen_info *si, efi_guid_t *proto, - unsigned long size, void **gop_handle) +setup_gop64(efi_system_table_t *sys_table_arg, struct screen_info *si, + efi_guid_t *proto, unsigned long size, void **gop_handle) { struct efi_graphics_output_protocol_64 *gop64, *first_gop; unsigned long nr_gops; @@ -767,7 +774,7 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, u64 fb_base; struct efi_pixel_bitmask pixel_info; int pixel_format; - efi_status_t status; + efi_status_t status = EFI_NOT_FOUND; u64 *handles = (u64 *)(unsigned long)gop_handle; int i; @@ -780,7 +787,7 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; bool conout_found = false; void *dummy = NULL; - u64 h = handles[i]; + efi_handle_t h = (efi_handle_t)(unsigned long)handles[i]; u64 current_fb_base; status = efi_call_early(handle_protocol, h, @@ -793,7 +800,8 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, if (status == EFI_SUCCESS) conout_found = true; - status = __gop_query64(gop64, &info, &size, ¤t_fb_base); + status = __gop_query64(sys_table_arg, gop64, &info, &size, + ¤t_fb_base); if (status == EFI_SUCCESS && (!first_gop || conout_found)) { /* * Systems that use the UEFI Console Splitter may @@ -850,8 +858,9 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, /* * See if we have Graphics Output Protocol */ -static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, - unsigned long size) +efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg, + struct screen_info *si, efi_guid_t *proto, + unsigned long size) { efi_status_t status; void **gop_handle = NULL; @@ -867,10 +876,13 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, if (status != EFI_SUCCESS) goto free_handle; - if (efi_early->is64) - status = setup_gop64(si, proto, size, gop_handle); - else - status = setup_gop32(si, proto, size, gop_handle); + if (efi_is_64bit()) { + status = setup_gop64(sys_table_arg, si, proto, size, + gop_handle); + } else { + status = setup_gop32(sys_table_arg, si, proto, size, + gop_handle); + } free_handle: efi_call_early(free_pool, gop_handle); @@ -1038,7 +1050,7 @@ void setup_graphics(struct boot_params *boot_params) EFI_LOCATE_BY_PROTOCOL, &graphics_proto, NULL, &size, gop_handle); if (status == EFI_BUFFER_TOO_SMALL) - status = setup_gop(si, &graphics_proto, size); + status = efi_setup_gop(NULL, si, &graphics_proto, size); if (status != EFI_SUCCESS) { size = 0; diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index d487e727f1ec7..4ee5318d7f280 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -85,6 +85,10 @@ struct efi_graphics_output_protocol { struct efi_graphics_output_protocol_mode *mode; }; +typedef efi_status_t (*efi_graphics_output_protocol_query_mode)( + struct efi_graphics_output_protocol *, u32, unsigned long *, + struct efi_graphics_output_mode_info **); + struct efi_uga_draw_protocol_32 { u32 get_mode; u32 set_mode; diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 53748c45e4885..10e440770371e 100644 --- a/arch/x86/include/asm/efi.h +++ 
b/arch/x86/include/asm/efi.h @@ -225,6 +225,11 @@ __pure const struct efi_config *__efi_early(void); #define efi_call_early(f, ...) \ __efi_early()->call(__efi_early()->f, __VA_ARGS__); +#define __efi_call_early(f, ...) \ + __efi_early()->call((unsigned long)f, __VA_ARGS__); + +#define efi_is_64bit() __efi_early()->is64 + extern bool efi_reboot_required(void); #else diff --git a/include/linux/efi.h b/include/linux/efi.h index e29a31d0fc353..c2949909339b7 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -1352,5 +1353,9 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg, efi_status_t efi_parse_options(char *cmdline); +efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg, + struct screen_info *si, efi_guid_t *proto, + unsigned long size); + bool efi_runtime_disabled(void); #endif /* _LINUX_EFI_H */ -- GitLab From fc37206427ce38eafbeff48099d873235e878450 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 25 Apr 2016 21:06:49 +0100 Subject: [PATCH 393/705] efi/libstub: Move Graphics Output Protocol handling to generic code The Graphics Output Protocol code executes in the stub, so create a generic version based on the x86 version in libstub so that we can move other archs to it in subsequent patches. The new source file gop.c is added to the libstub build for all architectures, but only wired up for x86. Signed-off-by: Ard Biesheuvel Signed-off-by: Matt Fleming Cc: Borislav Petkov Cc: David Herrmann Cc: Mark Rutland Cc: Peter Jones Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1461614832-17633-18-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/arm/include/asm/efi.h | 4 +- arch/arm64/include/asm/efi.h | 4 +- arch/x86/boot/compressed/eboot.c | 318 ----------------------- arch/x86/boot/compressed/eboot.h | 78 ------ drivers/firmware/efi/libstub/Makefile | 2 +- drivers/firmware/efi/libstub/gop.c | 354 ++++++++++++++++++++++++++ include/linux/efi.h | 81 +++++- 7 files changed, 441 insertions(+), 400 deletions(-) create mode 100644 drivers/firmware/efi/libstub/gop.c diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index b0c341d7ceee0..dc30d89a1ed34 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -60,7 +60,9 @@ void efi_virtmap_unload(void); /* arch specific definitions used by the stub code */ -#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define __efi_call_early(f, ...) f(__VA_ARGS__) +#define efi_is_64bit() (false) /* * A reasonable upper bound for the uncompressed kernel size is 32 MBytes, diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 4dafc89f373a0..af40baa5d53f9 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -52,7 +52,9 @@ int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); #define EFI_FDT_ALIGN SZ_2M /* used by allocate_new_fdt_and_exit_boot() */ #define MAX_FDT_OFFSET SZ_512M -#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define __efi_call_early(f, ...) 
f(__VA_ARGS__) +#define efi_is_64bit() (true) #define EFI_ALLOC_ALIGN SZ_64K diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 10516e22fdcb0..52fef606bc542 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -571,324 +571,6 @@ static void setup_efi_pci(struct boot_params *params) efi_call_early(free_pool, pci_handle); } -static void -setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line, - struct efi_pixel_bitmask pixel_info, int pixel_format) -{ - if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { - si->lfb_depth = 32; - si->lfb_linelength = pixels_per_scan_line * 4; - si->red_size = 8; - si->red_pos = 0; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 16; - si->rsvd_size = 8; - si->rsvd_pos = 24; - } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) { - si->lfb_depth = 32; - si->lfb_linelength = pixels_per_scan_line * 4; - si->red_size = 8; - si->red_pos = 16; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 0; - si->rsvd_size = 8; - si->rsvd_pos = 24; - } else if (pixel_format == PIXEL_BIT_MASK) { - find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size); - find_bits(pixel_info.green_mask, &si->green_pos, - &si->green_size); - find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size); - find_bits(pixel_info.reserved_mask, &si->rsvd_pos, - &si->rsvd_size); - si->lfb_depth = si->red_size + si->green_size + - si->blue_size + si->rsvd_size; - si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8; - } else { - si->lfb_depth = 4; - si->lfb_linelength = si->lfb_width / 2; - si->red_size = 0; - si->red_pos = 0; - si->green_size = 0; - si->green_pos = 0; - si->blue_size = 0; - si->blue_pos = 0; - si->rsvd_size = 0; - si->rsvd_pos = 0; - } -} - -static efi_status_t -__gop_query32(efi_system_table_t *sys_table_arg, - struct efi_graphics_output_protocol_32 *gop32, - struct efi_graphics_output_mode_info **info, - unsigned long *size, u64 *fb_base) -{ - struct efi_graphics_output_protocol_mode_32 *mode; - efi_graphics_output_protocol_query_mode query_mode; - efi_status_t status; - unsigned long m; - - m = gop32->mode; - mode = (struct efi_graphics_output_protocol_mode_32 *)m; - query_mode = (void *)(unsigned long)gop32->query_mode; - - status = __efi_call_early(query_mode, (void *)gop32, mode->mode, size, - info); - if (status != EFI_SUCCESS) - return status; - - *fb_base = mode->frame_buffer_base; - return status; -} - -static efi_status_t -setup_gop32(efi_system_table_t *sys_table_arg, struct screen_info *si, - efi_guid_t *proto, unsigned long size, void **gop_handle) -{ - struct efi_graphics_output_protocol_32 *gop32, *first_gop; - unsigned long nr_gops; - u16 width, height; - u32 pixels_per_scan_line; - u32 ext_lfb_base; - u64 fb_base; - struct efi_pixel_bitmask pixel_info; - int pixel_format; - efi_status_t status = EFI_NOT_FOUND; - u32 *handles = (u32 *)(unsigned long)gop_handle; - int i; - - first_gop = NULL; - gop32 = NULL; - - nr_gops = size / sizeof(u32); - for (i = 0; i < nr_gops; i++) { - struct efi_graphics_output_mode_info *info = NULL; - efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; - bool conout_found = false; - void *dummy = NULL; - efi_handle_t h = (efi_handle_t)(unsigned long)handles[i]; - u64 current_fb_base; - - status = efi_call_early(handle_protocol, h, - proto, (void **)&gop32); - if (status != EFI_SUCCESS) - continue; - - status = efi_call_early(handle_protocol, h, - &conout_proto, 
&dummy); - if (status == EFI_SUCCESS) - conout_found = true; - - status = __gop_query32(sys_table_arg, gop32, &info, &size, - &current_fb_base); - if (status == EFI_SUCCESS && (!first_gop || conout_found)) { - /* - * Systems that use the UEFI Console Splitter may - * provide multiple GOP devices, not all of which are - * backed by real hardware. The workaround is to search - * for a GOP implementing the ConOut protocol, and if - * one isn't found, to just fall back to the first GOP. - */ - width = info->horizontal_resolution; - height = info->vertical_resolution; - pixel_format = info->pixel_format; - pixel_info = info->pixel_information; - pixels_per_scan_line = info->pixels_per_scan_line; - fb_base = current_fb_base; - - /* - * Once we've found a GOP supporting ConOut, - * don't bother looking any further. - */ - first_gop = gop32; - if (conout_found) - break; - } - } - - /* Did we find any GOPs? */ - if (!first_gop) - goto out; - - /* EFI framebuffer */ - si->orig_video_isVGA = VIDEO_TYPE_EFI; - - si->lfb_width = width; - si->lfb_height = height; - si->lfb_base = fb_base; - - ext_lfb_base = (u64)(unsigned long)fb_base >> 32; - if (ext_lfb_base) { - si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; - si->ext_lfb_base = ext_lfb_base; - } - - si->pages = 1; - - setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); - - si->lfb_size = si->lfb_linelength * si->lfb_height; - - si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; -out: - return status; -} - -static efi_status_t -__gop_query64(efi_system_table_t *sys_table_arg, - struct efi_graphics_output_protocol_64 *gop64, - struct efi_graphics_output_mode_info **info, - unsigned long *size, u64 *fb_base) -{ - struct efi_graphics_output_protocol_mode_64 *mode; - efi_graphics_output_protocol_query_mode query_mode; - efi_status_t status; - unsigned long m; - - m = gop64->mode; - mode = (struct efi_graphics_output_protocol_mode_64 *)m; - query_mode = (void *)(unsigned long)gop64->query_mode; - - status = __efi_call_early(query_mode, (void *)gop64, mode->mode, size, - info); - if (status != EFI_SUCCESS) - return status; - - *fb_base = mode->frame_buffer_base; - return status; -} - -static efi_status_t -setup_gop64(efi_system_table_t *sys_table_arg, struct screen_info *si, - efi_guid_t *proto, unsigned long size, void **gop_handle) -{ - struct efi_graphics_output_protocol_64 *gop64, *first_gop; - unsigned long nr_gops; - u16 width, height; - u32 pixels_per_scan_line; - u32 ext_lfb_base; - u64 fb_base; - struct efi_pixel_bitmask pixel_info; - int pixel_format; - efi_status_t status = EFI_NOT_FOUND; - u64 *handles = (u64 *)(unsigned long)gop_handle; - int i; - - first_gop = NULL; - gop64 = NULL; - - nr_gops = size / sizeof(u64); - for (i = 0; i < nr_gops; i++) { - struct efi_graphics_output_mode_info *info = NULL; - efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; - bool conout_found = false; - void *dummy = NULL; - efi_handle_t h = (efi_handle_t)(unsigned long)handles[i]; - u64 current_fb_base; - - status = efi_call_early(handle_protocol, h, - proto, (void **)&gop64); - if (status != EFI_SUCCESS) - continue; - - status = efi_call_early(handle_protocol, h, - &conout_proto, &dummy); - if (status == EFI_SUCCESS) - conout_found = true; - - status = __gop_query64(sys_table_arg, gop64, &info, &size, - &current_fb_base); - if (status == EFI_SUCCESS && (!first_gop || conout_found)) { - /* - * Systems that use the UEFI Console Splitter may - * provide multiple GOP devices, not all of which are - * backed by real hardware.
The workaround is to search - * for a GOP implementing the ConOut protocol, and if - * one isn't found, to just fall back to the first GOP. - */ - width = info->horizontal_resolution; - height = info->vertical_resolution; - pixel_format = info->pixel_format; - pixel_info = info->pixel_information; - pixels_per_scan_line = info->pixels_per_scan_line; - fb_base = current_fb_base; - - /* - * Once we've found a GOP supporting ConOut, - * don't bother looking any further. - */ - first_gop = gop64; - if (conout_found) - break; - } - } - - /* Did we find any GOPs? */ - if (!first_gop) - goto out; - - /* EFI framebuffer */ - si->orig_video_isVGA = VIDEO_TYPE_EFI; - - si->lfb_width = width; - si->lfb_height = height; - si->lfb_base = fb_base; - - ext_lfb_base = (u64)(unsigned long)fb_base >> 32; - if (ext_lfb_base) { - si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; - si->ext_lfb_base = ext_lfb_base; - } - - si->pages = 1; - - setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); - - si->lfb_size = si->lfb_linelength * si->lfb_height; - - si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; -out: - return status; -} - -/* - * See if we have Graphics Output Protocol - */ -efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg, - struct screen_info *si, efi_guid_t *proto, - unsigned long size) -{ - efi_status_t status; - void **gop_handle = NULL; - - status = efi_call_early(allocate_pool, EFI_LOADER_DATA, - size, (void **)&gop_handle); - if (status != EFI_SUCCESS) - return status; - - status = efi_call_early(locate_handle, - EFI_LOCATE_BY_PROTOCOL, - proto, NULL, &size, gop_handle); - if (status != EFI_SUCCESS) - goto free_handle; - - if (efi_is_64bit()) { - status = setup_gop64(sys_table_arg, si, proto, size, - gop_handle); - } else { - status = setup_gop32(sys_table_arg, si, proto, size, - gop_handle); - } - -free_handle: - efi_call_early(free_pool, gop_handle); - return status; -} - static efi_status_t setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height) { diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index 4ee5318d7f280..c0223f1a89d71 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -11,84 +11,6 @@ #define DESC_TYPE_CODE_DATA (1 << 0) -#define EFI_CONSOLE_OUT_DEVICE_GUID \ - EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, 0x9a, 0x46, 0x0, 0x90, 0x27, \ - 0x3f, 0xc1, 0x4d) - -#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0 -#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1 -#define PIXEL_BIT_MASK 2 -#define PIXEL_BLT_ONLY 3 -#define PIXEL_FORMAT_MAX 4 - -struct efi_pixel_bitmask { - u32 red_mask; - u32 green_mask; - u32 blue_mask; - u32 reserved_mask; -}; - -struct efi_graphics_output_mode_info { - u32 version; - u32 horizontal_resolution; - u32 vertical_resolution; - int pixel_format; - struct efi_pixel_bitmask pixel_information; - u32 pixels_per_scan_line; -} __packed; - -struct efi_graphics_output_protocol_mode_32 { - u32 max_mode; - u32 mode; - u32 info; - u32 size_of_info; - u64 frame_buffer_base; - u32 frame_buffer_size; -} __packed; - -struct efi_graphics_output_protocol_mode_64 { - u32 max_mode; - u32 mode; - u64 info; - u64 size_of_info; - u64 frame_buffer_base; - u64 frame_buffer_size; -} __packed; - -struct efi_graphics_output_protocol_mode { - u32 max_mode; - u32 mode; - unsigned long info; - unsigned long size_of_info; - u64 frame_buffer_base; - unsigned long frame_buffer_size; -} __packed; - -struct efi_graphics_output_protocol_32 { - u32 query_mode; - u32 set_mode; - u32 blt; - u32 mode; 
-}; - -struct efi_graphics_output_protocol_64 { - u64 query_mode; - u64 set_mode; - u64 blt; - u64 mode; -}; - -struct efi_graphics_output_protocol { - void *query_mode; - unsigned long set_mode; - unsigned long blt; - struct efi_graphics_output_protocol_mode *mode; -}; - -typedef efi_status_t (*efi_graphics_output_protocol_query_mode)( - struct efi_graphics_output_protocol *, u32, unsigned long *, - struct efi_graphics_output_mode_info **); - struct efi_uga_draw_protocol_32 { u32 get_mode; u32 set_mode; diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index da99bbb74aebd..c06945160a415 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -28,7 +28,7 @@ OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. KCOV_INSTRUMENT := n -lib-y := efi-stub-helper.o +lib-y := efi-stub-helper.o gop.o # include the stub's generic dependencies from lib/ when building for ARM/arm64 arm-deps := fdt_rw.c fdt_ro.c fdt_wip.c fdt.c fdt_empty_tree.c fdt_sw.c sort.c diff --git a/drivers/firmware/efi/libstub/gop.c b/drivers/firmware/efi/libstub/gop.c new file mode 100644 index 0000000000000..932742e4cf231 --- /dev/null +++ b/drivers/firmware/efi/libstub/gop.c @@ -0,0 +1,354 @@ +/* ----------------------------------------------------------------------- + * + * Copyright 2011 Intel Corporation; author Matt Fleming + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +#include +#include +#include +#include + +static void find_bits(unsigned long mask, u8 *pos, u8 *size) +{ + u8 first, len; + + first = 0; + len = 0; + + if (mask) { + while (!(mask & 0x1)) { + mask = mask >> 1; + first++; + } + + while (mask & 0x1) { + mask = mask >> 1; + len++; + } + } + + *pos = first; + *size = len; +} + +static void +setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line, + struct efi_pixel_bitmask pixel_info, int pixel_format) +{ + if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { + si->lfb_depth = 32; + si->lfb_linelength = pixels_per_scan_line * 4; + si->red_size = 8; + si->red_pos = 0; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 16; + si->rsvd_size = 8; + si->rsvd_pos = 24; + } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) { + si->lfb_depth = 32; + si->lfb_linelength = pixels_per_scan_line * 4; + si->red_size = 8; + si->red_pos = 16; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 0; + si->rsvd_size = 8; + si->rsvd_pos = 24; + } else if (pixel_format == PIXEL_BIT_MASK) { + find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size); + find_bits(pixel_info.green_mask, &si->green_pos, + &si->green_size); + find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size); + find_bits(pixel_info.reserved_mask, &si->rsvd_pos, + &si->rsvd_size); + si->lfb_depth = si->red_size + si->green_size + + si->blue_size + si->rsvd_size; + si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8; + } else { + si->lfb_depth = 4; + si->lfb_linelength = si->lfb_width / 2; + si->red_size = 0; + si->red_pos = 0; + si->green_size = 0; + si->green_pos = 0; + si->blue_size = 0; + si->blue_pos = 0; + si->rsvd_size = 0; + si->rsvd_pos = 0; + } +}
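find_bits() above reduces each channel mask reported in a PIXEL_BIT_MASK mode to the (bit position, bit count) pair that struct screen_info stores. A standalone worked example (editorial, not part of the patch) using RGB565 masks, for which setup_pixel_info() would derive lfb_depth = 5 + 6 + 5 + 0 = 16 and lfb_linelength = pixels_per_scan_line * 2:

/* Editorial demo: check find_bits() arithmetic on RGB565 channel masks. */
#include <stdio.h>

static void find_bits(unsigned long mask, unsigned char *pos, unsigned char *size)
{
	unsigned char first = 0, len = 0;

	if (mask) {
		while (!(mask & 0x1)) {	/* skip low zero bits */
			mask >>= 1;
			first++;
		}
		while (mask & 0x1) {	/* count contiguous one bits */
			mask >>= 1;
			len++;
		}
	}
	*pos = first;
	*size = len;
}

int main(void)
{
	unsigned char pos, size;

	find_bits(0xf800, &pos, &size);	/* red   -> pos 11, size 5 */
	printf("red:   pos %u size %u\n", pos, size);
	find_bits(0x07e0, &pos, &size);	/* green -> pos 5,  size 6 */
	printf("green: pos %u size %u\n", pos, size);
	find_bits(0x001f, &pos, &size);	/* blue  -> pos 0,  size 5 */
	printf("blue:  pos %u size %u\n", pos, size);
	return 0;
}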
+ +static efi_status_t +__gop_query32(efi_system_table_t *sys_table_arg, + struct efi_graphics_output_protocol_32 *gop32, + struct efi_graphics_output_mode_info **info, + unsigned long *size, u64 *fb_base) +{ + struct efi_graphics_output_protocol_mode_32 *mode; + efi_graphics_output_protocol_query_mode query_mode; + efi_status_t status; + unsigned long m; + + m = gop32->mode; + mode = (struct efi_graphics_output_protocol_mode_32 *)m; + query_mode = (void *)(unsigned long)gop32->query_mode; + + status = __efi_call_early(query_mode, (void *)gop32, mode->mode, size, + info); + if (status != EFI_SUCCESS) + return status; + + *fb_base = mode->frame_buffer_base; + return status; +} + +static efi_status_t +setup_gop32(efi_system_table_t *sys_table_arg, struct screen_info *si, + efi_guid_t *proto, unsigned long size, void **gop_handle) +{ + struct efi_graphics_output_protocol_32 *gop32, *first_gop; + unsigned long nr_gops; + u16 width, height; + u32 pixels_per_scan_line; + u32 ext_lfb_base; + u64 fb_base; + struct efi_pixel_bitmask pixel_info; + int pixel_format; + efi_status_t status = EFI_NOT_FOUND; + u32 *handles = (u32 *)(unsigned long)gop_handle; + int i; + + first_gop = NULL; + gop32 = NULL; + + nr_gops = size / sizeof(u32); + for (i = 0; i < nr_gops; i++) { + struct efi_graphics_output_mode_info *info = NULL; + efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; + bool conout_found = false; + void *dummy = NULL; + efi_handle_t h = (efi_handle_t)(unsigned long)handles[i]; + u64 current_fb_base; + + status = efi_call_early(handle_protocol, h, + proto, (void **)&gop32); + if (status != EFI_SUCCESS) + continue; + + status = efi_call_early(handle_protocol, h, + &conout_proto, &dummy); + if (status == EFI_SUCCESS) + conout_found = true; + + status = __gop_query32(sys_table_arg, gop32, &info, &size, + &current_fb_base); + if (status == EFI_SUCCESS && (!first_gop || conout_found)) { + /* + * Systems that use the UEFI Console Splitter may + * provide multiple GOP devices, not all of which are + * backed by real hardware. The workaround is to search + * for a GOP implementing the ConOut protocol, and if + * one isn't found, to just fall back to the first GOP. + */ + width = info->horizontal_resolution; + height = info->vertical_resolution; + pixel_format = info->pixel_format; + pixel_info = info->pixel_information; + pixels_per_scan_line = info->pixels_per_scan_line; + fb_base = current_fb_base; + + /* + * Once we've found a GOP supporting ConOut, + * don't bother looking any further. + */ + first_gop = gop32; + if (conout_found) + break; + } + } + + /* Did we find any GOPs? */ + if (!first_gop) + goto out; + + /* EFI framebuffer */ + si->orig_video_isVGA = VIDEO_TYPE_EFI; + + si->lfb_width = width; + si->lfb_height = height; + si->lfb_base = fb_base; + + ext_lfb_base = (u64)(unsigned long)fb_base >> 32; + if (ext_lfb_base) { + si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + si->ext_lfb_base = ext_lfb_base; + } + + si->pages = 1; + + setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); + + si->lfb_size = si->lfb_linelength * si->lfb_height; + + si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; +out: + return status; +}
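The selection policy implemented by the loop in setup_gop32() is easy to misread in flattened form, so here is an editorial restatement on a hypothetical pre-collected candidate array (the real code works on raw EFI handles, but the logic is equivalent): remember the first usable GOP as a fallback, and let a GOP that also exposes the ConOut device GUID win outright.

/* Editorial restatement of the GOP selection policy; the struct and
 * helper are hypothetical, not part of the patch. */
#include <stdbool.h>

struct gop_candidate {
	bool conout;	/* handle also exposes EFI_CONSOLE_OUT_DEVICE_GUID */
};

static int pick_gop(const struct gop_candidate *c, int n)
{
	int fallback = -1;

	for (int i = 0; i < n; i++) {
		if (fallback < 0)
			fallback = i;	/* remember the first usable GOP */
		if (c[i].conout)
			return i;	/* a ConOut-backed GOP wins outright */
	}
	return fallback;		/* -1 if no GOP was found at all */
}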
+ +static efi_status_t +__gop_query64(efi_system_table_t *sys_table_arg, + struct efi_graphics_output_protocol_64 *gop64, + struct efi_graphics_output_mode_info **info, + unsigned long *size, u64 *fb_base) +{ + struct efi_graphics_output_protocol_mode_64 *mode; + efi_graphics_output_protocol_query_mode query_mode; + efi_status_t status; + unsigned long m; + + m = gop64->mode; + mode = (struct efi_graphics_output_protocol_mode_64 *)m; + query_mode = (void *)(unsigned long)gop64->query_mode; + + status = __efi_call_early(query_mode, (void *)gop64, mode->mode, size, + info); + if (status != EFI_SUCCESS) + return status; + + *fb_base = mode->frame_buffer_base; + return status; +} + +static efi_status_t +setup_gop64(efi_system_table_t *sys_table_arg, struct screen_info *si, + efi_guid_t *proto, unsigned long size, void **gop_handle) +{ + struct efi_graphics_output_protocol_64 *gop64, *first_gop; + unsigned long nr_gops; + u16 width, height; + u32 pixels_per_scan_line; + u32 ext_lfb_base; + u64 fb_base; + struct efi_pixel_bitmask pixel_info; + int pixel_format; + efi_status_t status = EFI_NOT_FOUND; + u64 *handles = (u64 *)(unsigned long)gop_handle; + int i; + + first_gop = NULL; + gop64 = NULL; + + nr_gops = size / sizeof(u64); + for (i = 0; i < nr_gops; i++) { + struct efi_graphics_output_mode_info *info = NULL; + efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; + bool conout_found = false; + void *dummy = NULL; + efi_handle_t h = (efi_handle_t)(unsigned long)handles[i]; + u64 current_fb_base; + + status = efi_call_early(handle_protocol, h, + proto, (void **)&gop64); + if (status != EFI_SUCCESS) + continue; + + status = efi_call_early(handle_protocol, h, + &conout_proto, &dummy); + if (status == EFI_SUCCESS) + conout_found = true; + + status = __gop_query64(sys_table_arg, gop64, &info, &size, + &current_fb_base); + if (status == EFI_SUCCESS && (!first_gop || conout_found)) { + /* + * Systems that use the UEFI Console Splitter may + * provide multiple GOP devices, not all of which are + * backed by real hardware. The workaround is to search + * for a GOP implementing the ConOut protocol, and if + * one isn't found, to just fall back to the first GOP. + */ + width = info->horizontal_resolution; + height = info->vertical_resolution; + pixel_format = info->pixel_format; + pixel_info = info->pixel_information; + pixels_per_scan_line = info->pixels_per_scan_line; + fb_base = current_fb_base; + + /* + * Once we've found a GOP supporting ConOut, + * don't bother looking any further. + */ + first_gop = gop64; + if (conout_found) + break; + } + } + + /* Did we find any GOPs? */ + if (!first_gop) + goto out; + + /* EFI framebuffer */ + si->orig_video_isVGA = VIDEO_TYPE_EFI; + + si->lfb_width = width; + si->lfb_height = height; + si->lfb_base = fb_base; + + ext_lfb_base = (u64)(unsigned long)fb_base >> 32; + if (ext_lfb_base) { + si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + si->ext_lfb_base = ext_lfb_base; + } + + si->pages = 1; + + setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); + + si->lfb_size = si->lfb_linelength * si->lfb_height; + + si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; +out: + return status; +}
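Both setup_gop32() and setup_gop64() handle a framebuffer that sits above 4 GiB by splitting the address: the low 32 bits go in lfb_base, the high bits in ext_lfb_base, with VIDEO_CAPABILITY_64BIT_BASE flagging that the extension field is valid. A consumer would reassemble it along these lines (hypothetical helper, kernel context assumed; the field names come from struct screen_info):

/* Hypothetical consumer-side helper (not in this patch): rebuild the
 * 64-bit framebuffer address that the stub split across lfb_base and
 * ext_lfb_base in struct screen_info. */
static u64 efifb_phys_base(const struct screen_info *si)
{
	u64 base = si->lfb_base;

	if (si->capabilities & VIDEO_CAPABILITY_64BIT_BASE)
		base |= (u64)si->ext_lfb_base << 32;

	return base;
}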
+ +/* + * See if we have Graphics Output Protocol + */ +efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg, + struct screen_info *si, efi_guid_t *proto, + unsigned long size) +{ + efi_status_t status; + void **gop_handle = NULL; + + status = efi_call_early(allocate_pool, EFI_LOADER_DATA, + size, (void **)&gop_handle); + if (status != EFI_SUCCESS) + return status; + + status = efi_call_early(locate_handle, + EFI_LOCATE_BY_PROTOCOL, + proto, NULL, &size, gop_handle); + if (status != EFI_SUCCESS) + goto free_handle; + + if (efi_is_64bit()) { + status = setup_gop64(sys_table_arg, si, proto, size, + gop_handle); + } else { + status = setup_gop32(sys_table_arg, si, proto, size, + gop_handle); + } + +free_handle: + efi_call_early(free_pool, gop_handle); + return status; +} diff --git a/include/linux/efi.h b/include/linux/efi.h index c2949909339b7..9203bbb28887c 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -283,7 +283,8 @@ typedef struct { efi_status_t (*handle_protocol)(efi_handle_t, efi_guid_t *, void **); void *__reserved; void *register_protocol_notify; - void *locate_handle; + efi_status_t (*locate_handle)(int, efi_guid_t *, void *, + unsigned long *, efi_handle_t *); void *locate_device_path; void *install_configuration_table; void *load_image; @@ -628,6 +629,10 @@ void efi_native_runtime_setup(void); EFI_GUID(0xdcfa911d, 0x26eb, 0x469f, \ 0xa2, 0x20, 0x38, 0xb7, 0xdc, 0x46, 0x12, 0x20) +#define EFI_CONSOLE_OUT_DEVICE_GUID \ + EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, \ + 0x9a, 0x46, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d) + typedef struct { efi_guid_t guid; u64 table; @@ -1214,6 +1219,80 @@ struct efi_simple_text_output_protocol { void *test_string; }; +#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0 +#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1 +#define PIXEL_BIT_MASK 2 +#define PIXEL_BLT_ONLY 3 +#define PIXEL_FORMAT_MAX 4 + +struct efi_pixel_bitmask { + u32 red_mask; + u32 green_mask; + u32 blue_mask; + u32 reserved_mask; +}; + +struct efi_graphics_output_mode_info { + u32 version; + u32 horizontal_resolution; + u32 vertical_resolution; + int pixel_format; + struct efi_pixel_bitmask pixel_information; + u32 pixels_per_scan_line; +} __packed; + +struct efi_graphics_output_protocol_mode_32 { + u32 max_mode; + u32 mode; + u32 info; + u32 size_of_info; + u64 frame_buffer_base; + u32 frame_buffer_size; +} __packed; + +struct efi_graphics_output_protocol_mode_64 { + u32 max_mode; + u32 mode; + u64 info; + u64 size_of_info; + u64 frame_buffer_base; + u64 frame_buffer_size; +} __packed; + +struct efi_graphics_output_protocol_mode { + u32 max_mode; + u32 mode; + unsigned long info; + unsigned long size_of_info; + u64 frame_buffer_base; + unsigned long frame_buffer_size; +} __packed; + +struct efi_graphics_output_protocol_32 { + u32 query_mode; + u32 set_mode; + u32 blt; + u32 mode; +}; + +struct efi_graphics_output_protocol_64 { + u64 query_mode; + u64 set_mode; + u64 blt; + u64 mode; +}; + +struct
efi_graphics_output_protocol { + unsigned long query_mode; + unsigned long set_mode; + unsigned long blt; + struct efi_graphics_output_protocol_mode *mode; +}; + +typedef efi_status_t (*efi_graphics_output_protocol_query_mode)( + struct efi_graphics_output_protocol *, u32, unsigned long *, + struct efi_graphics_output_mode_info **); + extern struct list_head efivar_sysfs_list; static inline void -- GitLab From 21289ec02b41c4b928a0b3de1778b325d714eea3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 25 Apr 2016 21:06:50 +0100 Subject: [PATCH 394/705] x86/efi/efifb: Move DMI based quirks handling out of generic code The efifb quirks handling based on DMI identification of the platform is specific to x86, so move it to x86 arch code. Signed-off-by: Ard Biesheuvel Signed-off-by: Matt Fleming Acked-by: David Herrmann Acked-by: Peter Jones Cc: Borislav Petkov Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1461614832-17633-19-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/include/asm/efi.h | 2 ++ arch/x86/kernel/sysfb_efi.c | 15 +++++++++++++++ drivers/video/fbdev/efifb.c | 15 ++++----------- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 10e440770371e..8747abe8872f9 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -180,6 +180,8 @@ static inline bool efi_runtime_supported(void) extern struct console early_efi_console; extern void parse_efi_setup(u64 phys_addr, u32 data_len); +extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt); + #ifdef CONFIG_EFI_MIXED extern void efi_thunk_runtime_setup(void); extern efi_status_t efi_thunk_set_virtual_address_map( diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c index b285d4e8c68e3..e21a8a7ddcffc 100644 --- a/arch/x86/kernel/sysfb_efi.c +++ b/arch/x86/kernel/sysfb_efi.c @@ -68,6 +68,21 @@ struct efifb_dmi_info efifb_dmi_list[] = { [M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE } }; +void efifb_setup_from_dmi(struct screen_info *si, const char *opt) +{ + int i; + + for (i = 0; i < M_UNKNOWN; i++) { + if (efifb_dmi_list[i].base != 0 && + !strcmp(opt, efifb_dmi_list[i].optname)) { + si->lfb_base = efifb_dmi_list[i].base; + si->lfb_linelength = efifb_dmi_list[i].stride; + si->lfb_width = efifb_dmi_list[i].width; + si->lfb_height = efifb_dmi_list[i].height; + } + } +} + #define choose_value(dmivalue, fwvalue, field, flags) ({ \ typeof(fwvalue) _ret_ = fwvalue; \ if ((flags) & (field)) \ diff --git a/drivers/video/fbdev/efifb.c b/drivers/video/fbdev/efifb.c index 95d293b7445a8..dd594369b8a62 100644 --- a/drivers/video/fbdev/efifb.c +++ b/drivers/video/fbdev/efifb.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -15,7 +16,7 @@ #include #include #include